In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Load the full arithmetic dataset; the bare name on the last line renders it as the cell output.
df = pd.read_csv(filepath_or_buffer='./arithmetic.csv')
df

Unnamed: 0,src,tgt
0,0+0=,0
1,0-0=,0
2,0*0=,0
3,(0+0)*0=,0
4,0+0*0=,0
...,...,...
2632495,(49+49)-49=,49
2632496,49+(49-49)=,49
2632497,49-49+49=,49
2632498,(49-49)+49=,49


In [None]:
# Row predicate: expressions built from two-digit operands joined only by + and -.
def check_PlusAndMinus(row) -> bool:
    """Tell whether ``row['src']`` is an add/subtract expression over two-digit numbers.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the expression text under the ``'src'`` key.

    Returns
    -------
    bool
        True when ``src`` holds at least three standalone two-digit numbers,
        contains ``+`` or ``-``, and contains neither ``*`` nor ``/``.
        False for a non-string ``src``.
    """
    src = row['src']
    # An empty CSV cell comes back from pandas as float NaN; re.findall would
    # raise TypeError on it, so any non-string simply does not match.
    if not isinstance(src, str):
        return False
    # The \b anchors stop the pattern from matching inside longer digit runs such as "100".
    numbers = re.findall(r'\b\d{2}\b', src)
    has_plus_or_minus = '+' in src or '-' in src
    has_other_operator = '*' in src or '/' in src
    return len(numbers) >= 3 and has_plus_or_minus and not has_other_operator

In [None]:
# Evaluate the predicate once per row, then keep the rows it accepted.
plus_minus_mask = df.apply(check_PlusAndMinus, axis=1)
filtered_df = df[plus_minus_mask]
filtered_df

Unnamed: 0,src,tgt
537255,10+10+10=,30
537256,10-10-10=,-10
537258,10+10-10=,10
537259,(10+10)-10=,10
537260,10+(10-10)=,10
...,...,...
2632495,(49+49)-49=,49
2632496,49+(49-49)=,49
2632497,49-49+49=,49
2632498,(49-49)+49=,49


In [None]:
# Save the add/subtract subset; index=False leaves the DataFrame index out of the file.
filtered_df.to_csv(path_or_buf='./filtered_plus_minus.csv', index=False)

In [None]:
# Row predicate for the central band: two-digit operands from 10 to 25.
def check_PlusAndMinus_random(row) -> bool:
    """Tell whether ``row['src']`` is an add/subtract expression over operands in 10-25.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the expression text under the ``'src'`` key.

    Returns
    -------
    bool
        True when ``src`` holds at least three standalone two-digit numbers
        whose value lies in 10..25, contains ``+`` or ``-``, and contains
        neither ``*`` nor ``/``. False for a non-string ``src``.
    """
    src = row['src']
    # An empty CSV cell comes back from pandas as float NaN; re.findall would
    # raise TypeError on it, so any non-string simply does not match.
    if not isinstance(src, str):
        return False
    # The \b anchors stop the pattern from matching inside longer digit runs such as "100".
    numbers = re.findall(r'\b\d{2}\b', src)
    # The lower bound is spelled out because the pattern also matches
    # zero-padded text such as "05", which int() would place below the band.
    valid_numbers = [value for value in map(int, numbers) if 10 <= value < 26]
    has_plus_or_minus = '+' in src or '-' in src
    has_other_operator = '*' in src or '/' in src
    return len(valid_numbers) >= 3 and has_plus_or_minus and not has_other_operator

In [None]:
# Keep only the rows accepted by the central-band predicate.
central_mask = df.apply(check_PlusAndMinus_random, axis=1)
filtered_df_ex1 = df[central_mask]
filtered_df_ex1

Unnamed: 0,src,tgt
537255,10+10+10=,30
537256,10-10-10=,-10
537258,10+10-10=,10
537259,(10+10)-10=,10
537260,10+(10-10)=,10
...,...,...
1343119,(25+25)-25=,25
1343120,25+(25-25)=,25
1343121,25-25+25=,25
1343122,(25-25)+25=,25


In [None]:
# Cap the subset at 30,000 rows; the fixed random_state makes the draw repeatable.
row_cap = 30000
if filtered_df_ex1.shape[0] > row_cap:
    filtered_df_ex1 = filtered_df_ex1.sample(n=row_cap, random_state=1)

# Disabled step, kept for reference: force the tgt column to integers.
# df['tgt'] = df['tgt'].fillna(0)
# df['tgt'] = df['tgt'].apply(int)

# Write the (possibly down-sampled) subset to a CSV file.
filtered_df_ex1.to_csv(path_or_buf='./filtered_PlusAndMinus_random.csv', index=False)

In [None]:
# Row predicate for the outer bands: two-digit operands in 10-17 or above 40.
def check_PlusAndMinus_extreme(row) -> bool:
    """Tell whether ``row['src']`` is an add/subtract expression over outer-band operands.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the expression text under the ``'src'`` key.

    Returns
    -------
    bool
        True when ``src`` holds at least three standalone two-digit numbers
        whose value is either in 10..17 or greater than 40, contains ``+`` or
        ``-``, and contains neither ``*`` nor ``/``. False for a non-string
        ``src``.
    """
    src = row['src']
    # An empty CSV cell comes back from pandas as float NaN; re.findall would
    # raise TypeError on it, so any non-string simply does not match.
    if not isinstance(src, str):
        return False
    # The \b anchors stop the pattern from matching inside longer digit runs such as "100".
    numbers = re.findall(r'\b\d{2}\b', src)
    # The low band's floor is spelled out because the pattern also matches
    # zero-padded text such as "05", which int() would place below 10.
    # NOTE(review): the original header gives the high band as 41-49, but no
    # upper limit is enforced here - confirm the dataset never exceeds 49.
    valid_numbers = [
        value for value in map(int, numbers)
        if value > 40 or 10 <= value < 18
    ]
    has_plus_or_minus = '+' in src or '-' in src
    has_other_operator = '*' in src or '/' in src
    return len(valid_numbers) >= 3 and has_plus_or_minus and not has_other_operator

In [None]:
# Keep only the rows accepted by the outer-band predicate.
extreme_mask = df.apply(check_PlusAndMinus_extreme, axis=1)
filtered_df_ex2 = df[extreme_mask]
filtered_df_ex2

Unnamed: 0,src,tgt
537255,10+10+10=,30
537256,10-10-10=,-10
537258,10+10-10=,10
537259,(10+10)-10=,10
537260,10+(10-10)=,10
...,...,...
2632495,(49+49)-49=,49
2632496,49+(49-49)=,49
2632497,49-49+49=,49
2632498,(49-49)+49=,49


In [None]:
# Cap the subset at 30,000 rows; the fixed random_state makes the draw repeatable.
row_cap = 30000
if filtered_df_ex2.shape[0] > row_cap:
    filtered_df_ex2 = filtered_df_ex2.sample(n=row_cap, random_state=1)

# Disabled step, kept for reference: force the tgt column to integers.
# df['tgt'] = df['tgt'].fillna(0)
# df['tgt'] = df['tgt'].apply(int)

# Write the (possibly down-sampled) subset to a CSV file.
filtered_df_ex2.to_csv(path_or_buf='./filtered_PlusAndMinus_extreme.csv', index=False)

In [None]:
# Row predicate for the upper band: two-digit operands greater than 33.
def check_PlusAndMinus_out(row) -> bool:
    """Tell whether ``row['src']`` is an add/subtract expression over operands above 33.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the expression text under the ``'src'`` key.

    Returns
    -------
    bool
        True when ``src`` holds at least three standalone two-digit numbers
        greater than 33, contains ``+`` or ``-``, and contains neither ``*``
        nor ``/``. False for a non-string ``src``.
    """
    src = row['src']
    # An empty CSV cell comes back from pandas as float NaN; re.findall would
    # raise TypeError on it, so any non-string simply does not match.
    if not isinstance(src, str):
        return False
    # The \b anchors stop the pattern from matching inside longer digit runs such as "100".
    numbers = re.findall(r'\b\d{2}\b', src)
    # NOTE(review): the original header gives the band as 34-49, but only the
    # lower limit is enforced here - confirm the dataset never exceeds 49.
    valid_numbers = [value for value in map(int, numbers) if value > 33]
    has_plus_or_minus = '+' in src or '-' in src
    has_other_operator = '*' in src or '/' in src
    return len(valid_numbers) >= 3 and has_plus_or_minus and not has_other_operator

In [None]:
# Keep only the rows accepted by the upper-band predicate.
upper_mask = df.apply(check_PlusAndMinus_out, axis=1)
filtered_df_ex3 = df[upper_mask]
filtered_df_ex3

Unnamed: 0,src,tgt
1826631,34+34+34=,102
1826632,34-34-34=,-34
1826634,34+34-34=,34
1826635,(34+34)-34=,34
1826636,34+(34-34)=,34
...,...,...
2632495,(49+49)-49=,49
2632496,49+(49-49)=,49
2632497,49-49+49=,49
2632498,(49-49)+49=,49


In [None]:
# Cap the subset at 30,000 rows; the fixed random_state makes the draw repeatable.
row_cap = 30000
if filtered_df_ex3.shape[0] > row_cap:
    filtered_df_ex3 = filtered_df_ex3.sample(n=row_cap, random_state=1)

# Disabled step, kept for reference: force the tgt column to integers.
# df['tgt'] = df['tgt'].fillna(0)
# df['tgt'] = df['tgt'].apply(int)

# Write the (possibly down-sampled) subset to a CSV file.
filtered_df_ex3.to_csv(path_or_buf='./filtered_PlusAndMinus_out.csv', index=False)