In [1]:
import os

import pandas as pd

In [None]:
# Load the DeepFaceLab issue spreadsheet into a DataFrame.
df = pd.read_excel('./my_data/train/deepfacelab.xlsx')

### Add Length of title and description

In [None]:
# Preview the first row to see which columns were loaded.
df.head(1)

Unnamed: 0,fullname,number,html_url,title,description,labels
0,iperov/DeepFaceLab,1,https://github.com/iperov/DeepFaceLab/issues/1,"`Error: Sorry, this model works only on 2GB+ G...",## Expected behavior_x000D_\nStart training._x...,other


In [None]:
def _strlen(sentence):
    import re
    zhmodel = re.compile(u'[\u4e00-\u9fa5]')  # 检查中文
    contents = str(sentence)
    match = zhmodel.search(contents)
    if match:
        return -1
    else:
        return len(str(sentence).strip().split(' '))

# Per-issue token counts; -1 flags text that contains Chinese characters.
title_lens = df['title'].apply(_strlen)
body_lens = df['description'].apply(_strlen)

In [None]:
# Place each length column directly after the text column it measures.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without reloading `df`.
for text_col, len_col, lens in (
    ("title", "title_lens", title_lens),
    ("description", "description_lens", body_lens),
):
    if len_col not in df.columns:
        df.insert(loc=df.columns.get_loc(text_col) + 1, column=len_col, value=lens)

In [None]:
# Preview the first row again to confirm the two length columns were added.
df.head(1)

Unnamed: 0,fullname,number,html_url,title,title_lens,description,description_lens,labels
0,iperov/DeepFaceLab,1,https://github.com/iperov/DeepFaceLab/issues/1,"`Error: Sorry, this model works only on 2GB+ G...",15,## Expected behavior_x000D_\nStart training._x...,375,other


### Remove rows with NaN labels and rows with a short title and no description

In [None]:
# List the distinct label values present in the data.
df['labels'].unique()

array(['other', 'Low efficiency and Effectiveness', 'deployment', 'Error',
       'tensor&inputs'], dtype=object)

In [None]:
# Rows whose label is missing.
contain_na = df.loc[df['labels'].isna()]
contain_na.head(3)

Unnamed: 0,fullname,number,html_url,title,title_lens,description,description_lens,labels


In [None]:
# Drop unlabelled rows and report the row count before and after.
len_before = df.shape[0]
df = df.dropna(subset=['labels'])
len_after = df.shape[0]
print(f'before:{len_before}, after:{len_after}')

before:716, after:716


In [None]:
# A length of -1 marks Chinese text; drop rows where both fields are flagged.
title_is_chinese = df['title_lens'].eq(-1)
description_is_chinese = df['description_lens'].eq(-1)
contain_chinese_characters = df.loc[title_is_chinese & description_is_chinese]
df = df.drop(index=contain_chinese_characters.index)

In [None]:
# Drop rows that have a one-token title and no description at all.
one_token_title = df['title_lens'].eq(1)
no_description = df['description'].isna()
body_nan = df.loc[one_token_title & no_description]
df = df.drop(index=body_nan.index)

In [None]:
# Order rows by title length, breaking ties by description length (ascending).
df = df.sort_values(by=['title_lens', 'description_lens'])

# Save the new file

In [None]:
# Build a filesystem-friendly name from the "owner/repo" value in `fullname`.
# Positional access is required here: `df['fullname'][0]` looks up the index
# *label* 0 and raises KeyError once that row has been removed by the filters.
# NOTE(review): assumes every row of this file shares one `fullname` — confirm.
name = df['fullname'].iloc[0].replace('/', '_')
name

'iperov_DeepFaceLab'

In [None]:
# Ensure the per-repository output directory exists. `makedirs` also creates a
# missing `./my_data` parent, and `exist_ok` makes the cell safe to re-run.
os.makedirs(os.path.join('./my_data', name), exist_ok=True)

In [None]:
# Write the cleaned frame to an Excel file in the current working directory.
# NOTE(review): the ./my_data/<name> directory created in the previous cell is
# not part of this path — confirm whether the file was meant to go there.
df.to_excel(f'{name}.xlsx')

In [None]:
import json

# Serialise the three model-input columns with pandas, then parse the text
# back into plain Python records for the standard json module.
result = df[['title', 'description', 'labels']].to_json(orient="records")
parsed = json.loads(result)

# Write the records as a single JSON array.
with open(f'{name}.txt', 'w') as f:
    f.write(json.dumps(parsed))

In [None]:
# Read an exported records file back and print its first record as a check.
data = []
with open('deepfakes_faceswap.txt', 'r', encoding='utf-8') as f:
    data = json.load(f)

for obj in data:
    print(obj)
    # Only the first record is printed.
    break

{'title': 'setUpfailed', 'description': '![default](https://user-images.githubusercontent.com/28338863/53715027-7e309480-3e8b-11e9-8d5b-aebb44e63f9e.png)\r\n\r\ndear editor,what does these mean?', 'labels': 'other'}


## Combine the label file with the comments file

In [None]:
import pandas as pd

# Read the two Excel files.
df1 = pd.read_excel('./issue_data/issue_xlxs/EasyOCR_issue_classify_2024_01_01.xlsx')  # provides the 'commment_concat_str' column
df2 = pd.read_excel('./issue_data/issue_newlabel/EasyOCR_newlabel.xlsx')  # the file the comment data is merged into

# Merge on the 'number' column; the left join keeps every row of df2.
merged_df = pd.merge(df2, df1[['number', 'commment_concat_str']], on='number', how='left')

# Save the merged DataFrame as a new Excel file. To replace the second input
# file instead, write to its path.
merged_df.to_excel('./issue_data/issue_newlabel_comments/EasyOCR_newlabel_with_comments.xlsx', index=False)

## Combine the code

In [1]:
import os
import glob
import json
import pandas as pd

In [2]:
def _strlen(sentence):
    import re
    zhmodel = re.compile(u'[\u4e00-\u9fa5]')  #检查中文
    contents = str(sentence)
    match = zhmodel.search(contents)
    if match:
        return -1
    else:
        return len(str(sentence).strip().split(' '))

def concatenate_df_and_save(dfs):
    """Concatenate several issue frames and write them out as JSON records.

    Args:
        dfs: Iterable of DataFrames, each providing ``title``, ``description``
            and ``labels`` columns.

    The records are written to ``./my_data/train/concat/concat.txt``; nothing
    is returned.

    Raises:
        ValueError: raised by ``pd.concat`` when ``dfs`` is empty.
    """
    df = pd.concat(dfs)
    dir_name = os.path.join('./my_data/train', 'concat')
    # makedirs creates missing parent directories as well and tolerates an
    # existing target, which the exists()/mkdir() pair did not.
    os.makedirs(dir_name, exist_ok=True)

    out_path = os.path.join(dir_name, 'concat.txt')
    result = df[['title', 'description', 'labels']].to_json(orient="records")
    with open(out_path, 'w') as f:
        json.dump(json.loads(result), f)

def save_df(df, filename, stage="train"):
    """Write the exported columns of ``df`` to a JSON records file.

    The output path is ``./my_data/<stage>/<filename>/<filename>.txt``.

    Args:
        df: DataFrame with ``number``, ``html_url``, ``title``, ``description``
            and ``labels`` columns. A ``commment_concat_str`` column (spelled
            as in the spreadsheets) is exported too when present.
        filename: Used both as the output directory name and as the file stem.
        stage: One of ``"train"``, ``"test"`` or ``"valid"``.

    Raises:
        ValueError: if ``stage`` is not one of the supported values. Previously
            such a call failed with UnboundLocalError on ``dir_name``.
    """
    stage_dirs = {
        "train": './my_data/train',
        "test": './my_data/test',
        "valid": './my_data/valid',
    }
    if not isinstance(stage, str) or stage not in stage_dirs:
        raise ValueError(
            f"stage must be one of {sorted(stage_dirs)}, got {stage!r}"
        )

    dir_name = os.path.join(stage_dirs[stage], filename)
    # makedirs also creates a missing stage directory and tolerates re-runs.
    os.makedirs(dir_name, exist_ok=True)

    out_path = os.path.join(dir_name, filename + '.txt')
    columns = ['number', 'html_url', 'title', 'description', 'labels']
    if 'commment_concat_str' in df.columns:
        # Keep the comments column immediately before the labels column.
        columns.insert(columns.index('labels'), 'commment_concat_str')
    result = df[columns].to_json(orient="records")
    with open(out_path, 'w') as f:
        json.dump(json.loads(result), f)
    
def preprocess(file):
    """Load an issue spreadsheet and drop rows that cannot be used.

    ``title_lens`` and ``description_lens`` columns (computed with
    ``_strlen``) are inserted right after ``title`` and ``description``.
    Rows with a missing title, description or label are removed, followed by
    rows whose title and description are both flagged as Chinese (-1). The row
    count is printed before and after the filtering.

    Args:
        file: Path of the Excel file to read.

    Returns:
        The filtered DataFrame, including the two length columns.
    """
    issues = pd.read_excel(file)

    # Add each length column directly after the text column it measures.
    for text_col, len_col in (("title", 'title_lens'), ("description", 'description_lens')):
        lengths = issues[text_col].apply(_strlen)
        issues.insert(loc=issues.columns.get_loc(text_col) + 1, column=len_col, value=lengths)

    # "owner/repo" -> "owner_repo", taken from the first row of the file.
    name = '_'.join(issues['fullname'][0].split('/'))
    print(f'before drop, the length of {name} is:{len(issues)}')

    # Remove rows with missing text or label.
    issues = issues.dropna(subset=['title', 'description', 'labels'])

    # Remove rows where both title and description contain Chinese characters.
    both_chinese = (issues.title_lens == -1) & (issues.description_lens == -1)
    issues = issues.drop(issues[both_chinese].index)

    print(f'after drop, the length of {name} is:{len(issues)}')
    return issues
    
def preprocess_vote(file, group_num=3):
    """Load an issue spreadsheet and drop unusable rows one whole group at a time.

    Rows are taken in consecutive groups of ``group_num`` by position. When any
    row of a group has a missing value in any column, or has both its title and
    its description flagged as Chinese (-1), every row of that group is
    dropped. The row count is printed before and after the filtering.

    Args:
        file: Path of the Excel file to read.
        group_num: Number of consecutive rows that form one group.

    Returns:
        The filtered DataFrame, including the ``title_lens`` and
        ``description_lens`` columns.
    """
    df = pd.read_excel(file)
    # add title and description length
    title_lens, description_lens = df['title'].apply(_strlen), df['description'].apply(_strlen)

    df.insert(loc=df.columns.get_loc("title") + 1, column='title_lens', value=title_lens)
    df.insert(loc=df.columns.get_loc("description") + 1, column='description_lens', value=description_lens)

    name = '_'.join(df['fullname'][0].split('/'))
    print(f'before drop, the length of {name} is:{len(df)}')

    # Collect the index labels of every group that must be removed. The
    # per-group debug dumps were removed: they printed each group in full.
    rows_to_drop = []
    for start in range(0, len(df), group_num):
        group = df.iloc[start:start + group_num]
        # NOTE(review): this checks every column for NaN, not only
        # title/description/labels as preprocess() does — confirm intent.
        has_missing = group.isnull().any(axis=1).any()
        both_chinese = ((group['title_lens'] == -1) & (group['description_lens'] == -1)).any()
        if has_missing or both_chinese:
            rows_to_drop.extend(group.index)

    # Remove the collected rows.
    df = df.drop(index=rows_to_drop)

    print(f'after drop, the length of {name} is:{len(df)}')
    return df

### Concat train file

In [3]:
# dfs = []
# dfs.append(preprocess('my_data/train/pytorch-CycleGAN-and-pix2pix_TRAIN_Aug.xlsx'))
# dfs.append(preprocess('my_data/train/Real-Time-Voice-Cloning_TRAIN_Aug.xlsx'))
# dfs.append(preprocess('my_data/train/EasyOCR_TRAIN_Aug.xlsx'))
# dfs.append(preprocess('my_data/train/recommenders1_TRAIN_Aug.xlsx')) 
# # dfs.append(preprocess('my_data/train/streamlit1_TRAIN_Aug.xlsx'))
# concatenate_df_and_save(dfs)

In [3]:
# Preprocess and export the caffe_pytorch_newlabel_clean file of each split.
for stage in ("train", "valid", "test"):
    stem = f'caffe_pytorch_newlabel_clean_{stage.upper()}_Aug'
    save_df(preprocess(f'my_data/{stage}/{stem}.xlsx'), stem, stage)

before drop, the length of CMU-Perceptual-Computing-Lab_openpose is:8680
after drop, the length of CMU-Perceptual-Computing-Lab_openpose is:8666
before drop, the length of CMU-Perceptual-Computing-Lab_openpose is:656
after drop, the length of CMU-Perceptual-Computing-Lab_openpose is:653
before drop, the length of dusty-nv_jetson-inference is:1094
after drop, the length of dusty-nv_jetson-inference is:1090


In [5]:
# Preprocess and export the framework_newlabel_clean file of each split.
for stage in ("train", "valid", "test"):
    stem = f'framework_newlabel_clean_{stage.upper()}_Aug'
    save_df(preprocess(f'my_data/{stage}/{stem}.xlsx'), stem, stage)

before drop, the length of deezer_spleeter is:14445
after drop, the length of deezer_spleeter is:14428
before drop, the length of streamlit_streamlit is:1328
after drop, the length of streamlit_streamlit is:1325
before drop, the length of streamlit_streamlit is:1934
after drop, the length of streamlit_streamlit is:1929


In [5]:
# save_df(preprocess('my_data/train/pytorch-CycleGAN-and-pix2pix_TRAIN_Aug.xlsx'), 'pytorch-CycleGAN-and-pix2pix_TRAIN_Aug')
# save_df(preprocess('my_data/train/Real-Time-Voice-Cloning_TRAIN_Aug.xlsx'), 'Real-Time-Voice-Cloning_TRAIN_Aug')
# save_df(preprocess('my_data/train/EasyOCR_TRAIN_Aug.xlsx'), 'EasyOCR_TRAIN_Aug')
# save_df(preprocess('my_data/train/recommenders1_TRAIN_Aug.xlsx'), 'recommenders1_TRAIN_Aug') 
# save_df(preprocess('my_data/train/streamlit1_TRAIN_Aug.xlsx'), 'streamlit1_TRAIN_Aug')

# save_df(preprocess('my_data/train/streamlit_clean_TRAIN_Aug.xlsx'), 'streamlit_clean_TRAIN_Aug')
# save_df(preprocess('my_data/test/streamlit_clean_TEST_Aug.xlsx'), 'streamlit_clean_TEST_Aug', False)

# streamlit newlabel
# save_df(preprocess('my_data/train/streamlit_new_clean_TRAIN_Aug.xlsx'), 'streamlit_new_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/streamlit_new_clean_VALID_Aug.xlsx'), 'streamlit_new_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/streamlit_new_clean_TEST_Aug.xlsx'), 'streamlit_new_clean_TEST_Aug', "test")

#  newlabel Real-Time-Voice-Cloning_newlabel_clean_TRAIN_Aug
# Preprocess and export the train/valid/test files of each dataset in turn.
for dataset in ('pytorch', 'tensorflow', 'caffe'):
    for stage in ("train", "valid", "test"):
        stem = f'{dataset}_newlabel_clean_{stage.upper()}_Aug'
        save_df(preprocess(f'my_data/{stage}/{stem}.xlsx'), stem, stage)


# save_df(preprocess('my_data/train/Real-Time-Voice-Cloning_newlabel_clean_TRAIN_Aug.xlsx'), 'Real-Time-Voice-Cloning_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/Real-Time-Voice-Cloning_newlabel_clean_VALID_Aug.xlsx'), 'Real-Time-Voice-Cloning_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/Real-Time-Voice-Cloning_newlabel_clean_TEST_Aug.xlsx'), 'Real-Time-Voice-Cloning_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/openpose_newlabel_clean_TRAIN_Aug.xlsx'), 'openpose_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/openpose_newlabel_clean_VALID_Aug.xlsx'), 'openpose_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/openpose_newlabel_clean_TEST_Aug.xlsx'), 'openpose_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/pytorch-CycleGAN-and-pix2pix_newlabel_clean_TRAIN_Aug.xlsx'), 'pytorch-CycleGAN-and-pix2pix_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/pytorch-CycleGAN-and-pix2pix_newlabel_clean_VALID_Aug.xlsx'), 'pytorch-CycleGAN-and-pix2pix_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/pytorch-CycleGAN-and-pix2pix_newlabel_clean_TEST_Aug.xlsx'), 'pytorch-CycleGAN-and-pix2pix_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/deepfacelab_newlabel_clean_TRAIN_Aug.xlsx'), 'deepfacelab_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/deepfacelab_newlabel_clean_VALID_Aug.xlsx'), 'deepfacelab_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/deepfacelab_newlabel_clean_TEST_Aug.xlsx'), 'deepfacelab_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/faceswap_newlabel_clean_TRAIN_Aug.xlsx'), 'faceswap_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/faceswap_newlabel_clean_VALID_Aug.xlsx'), 'faceswap_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/faceswap_newlabel_clean_TEST_Aug.xlsx'), 'faceswap_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/faceswap_newlabel_clean_TRAIN_Aug.xlsx'), 'faceswap_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/test/faceswap_newlabel_clean_TEST_Aug.xlsx'), 'faceswap_newlabel_clean_TEST_Aug', False)

# save_df(preprocess('my_data/train/EasyOCR_newlabel_clean_TRAIN_Aug.xlsx'), 'EasyOCR_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/EasyOCR_newlabel_clean_VALID_Aug.xlsx'), 'EasyOCR_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/EasyOCR_newlabel_clean_TEST_Aug.xlsx'), 'EasyOCR_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/EasyOCR_newlabel_with_comments_clean_TRAIN_Aug.xlsx'), 'EasyOCR_newlabel_with_comments_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/EasyOCR_newlabel_with_comments_clean_VALID_Aug.xlsx'), 'EasyOCR_newlabel_with_comments_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/EasyOCR_newlabel_with_comments_clean_TEST_Aug.xlsx'), 'EasyOCR_newlabel_with_comments_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/EasyOCR_newlabel_clean_TRAIN_Aug.xlsx'), 'EasyOCR_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/valid/EasyOCR_newlabel_clean_VALID_Aug.xlsx'), 'EasyOCR_newlabel_clean_VALID_Aug', "valid")
# save_df(preprocess('my_data/test/EasyOCR_newlabel_clean_TEST_Aug.xlsx'), 'EasyOCR_newlabel_clean_TEST_Aug', "test")

# save_df(preprocess('my_data/train/deepfacelab_newlabel_clean_TRAIN_Aug.xlsx'), 'deepfacelab_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/test/deepfacelab_newlabel_clean_TEST_Aug.xlsx'), 'deepfacelab_newlabel_clean_TEST_Aug', False)

# save_df(preprocess('my_data/train/Real-Time-Voice-Cloning_newlabel_clean_TRAIN_Aug.xlsx'), 'Real-Time-Voice-Cloning_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/test/Real-Time-Voice-Cloning_newlabel_clean_TEST_Aug.xlsx'), 'Real-Time-Voice-Cloning_newlabel_clean_TEST_Aug', False)

# save_df(preprocess('my_data/train/recommenders_newlabel_TRAIN_Aug.xlsx'), 'recommenders_newlabel_TRAIN_Aug')
# save_df(preprocess('my_data/test/recommenders_newlabel_clean_TEST_Aug.xlsx'), 'recommenders_newlabel_clean_TEST_Aug', False)

# save_df(preprocess('my_data/train/TTS_newlabel_clean_TRAIN_Aug.xlsx'), 'TTS_newlabel_clean_TRAIN_Aug')
# save_df(preprocess('my_data/test/TTS_newlabel_clean_TEST_Aug.xlsx'), 'TTS_newlabel_clean_TEST_Aug', False)



# save_df(preprocess_vote('my_data/test/streamlit_new_clean_TEST_AugVote2.xlsx'), 'streamlit_new_clean_TEST_AugVote2', False)

# save_df(preprocess('my_data/train/streamlit_clean_1_TRAIN_Aug.xlsx'), 'streamlit_clean_1_TRAIN_Aug')
# save_df(preprocess('my_data/test/streamlit_clean_1_TEST_Aug.xlsx'), 'streamlit_clean_1_TEST_Aug', False)

# save_df(preprocess('my_data/train/faceswap_TRAIN_Aug.xlsx'), 'faceswap_TRAIN_Aug')
# save_df(preprocess('my_data/test/faceswap_TEST_Aug.xlsx'), 'faceswap_TEST_Aug', False)

# save_df(preprocess('my_data/train/deepfacelab_TRAIN_Aug.xlsx'), 'deepfacelab_TRAIN_Aug')
# save_df(preprocess('my_data/test/deepfacelab_TEST_Aug.xlsx'), 'deepfacelab_TEST_Aug', False)

# save_df(preprocess('my_data/train/streamlit_TRAIN_Aug.xlsx'), 'streamlit_TRAIN_Aug')
# save_df(preprocess('my_data/test/streamlit_TEST_Aug.xlsx'), 'streamlit_TEST_Aug', False)
# save_df(preprocess_vote('my_data/test/streamlit_TEST_Aug.xlsx'), 'streamlit_TEST_Aug', False)

# save_df(preprocess('my_data/train/streamlit_newlabel_TRAIN_Aug.xlsx'), 'streamlit_newlabel_TRAIN_Aug')
# save_df(preprocess('my_data/test/streamlit_newlabel_TEST_Aug.xlsx'), 'streamlit_newlabel_TEST_Aug', False)

before drop, the length of JaidedAI_EasyOCR is:4545
after drop, the length of JaidedAI_EasyOCR is:4536
before drop, the length of CorentinJ_Real-Time-Voice-Cloning is:329
after drop, the length of CorentinJ_Real-Time-Voice-Cloning is:326
before drop, the length of junyanz_pytorch-CycleGAN-and-pix2pix is:549
after drop, the length of junyanz_pytorch-CycleGAN-and-pix2pix is:545
before drop, the length of deezer_spleeter is:5765
after drop, the length of deezer_spleeter is:5762
before drop, the length of streamlit_streamlit is:672
after drop, the length of streamlit_streamlit is:672
before drop, the length of streamlit_streamlit is:840
after drop, the length of streamlit_streamlit is:839
before drop, the length of CMU-Perceptual-Computing-Lab_openpose is:4135
after drop, the length of CMU-Perceptual-Computing-Lab_openpose is:4135
before drop, the length of CMU-Perceptual-Computing-Lab_openpose is:327
after drop, the length of CMU-Perceptual-Computing-Lab_openpose is:326
before drop, the l

In [5]:
# files = glob.glob('./my_data/train/*.xlsx')
# for f in files:
#     print(f)

# dfs = []
# for f in files:
#     dfs.append(preprocess(f))
# concatenate_df_and_save(dfs)    

## Process one file

In [None]:
test_file = './my_data/test/TTS.xlsx'
def save_test_to_txt(file):
    """Preprocess one test spreadsheet and export it as JSON records.

    The records (``title``, ``description``, ``labels``) are written to
    ``./my_data/test/<stem>/<stem>.txt``, where ``<stem>`` is the file name of
    ``file`` without its extension.

    Args:
        file: Path of the Excel file to preprocess.
    """
    df = preprocess(file)
    # basename/splitext strip only the final extension, so a name such as
    # "a.b.xlsx" yields "a.b" instead of being truncated at the first dot.
    file_name = os.path.splitext(os.path.basename(file))[0]
    dir_name = os.path.join('./my_data/test', file_name)
    # makedirs also creates missing parents and tolerates an existing target.
    os.makedirs(dir_name, exist_ok=True)

    out_path = os.path.join(dir_name, f'{file_name}.txt')
    result = df[['title', 'description', 'labels']].to_json(orient="records")
    with open(out_path, 'w') as f:
        json.dump(json.loads(result), f)

save_test_to_txt(test_file)

before drop, the length of mozilla_TTS is:506
after drop, the length of mozilla_TTS is:478
