In [1]:
import pandas as pd

# Name of the column that will receive the newly drawn split assignment.
where_to_save = "tvt2_3"

# "\n" is passed explicitly as the record separator for the parser.
data = pd.read_csv(
    "../../data/processed/liarpantsfire_dataset.csv",
    lineterminator="\n",
)
print(data.shape)
data.head()

(12791, 6)


Unnamed: 0,id,statement,label,tvt2,tvt2_1,tvt2_2
0,2635.json,Says the Annies List political group supports ...,false,validation,validation,training
1,10540.json,When did the decline of coal start? It started...,half-true,training,training,validation
2,324.json,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,training,training,validation
3,1123.json,Health care reform legislation is likely to ma...,false,training,validation,testting
4,9028.json,The economic turnaround started at the end of ...,half-true,training,training,testting


In [2]:
# Row count of the dataset; one split label is drawn per row further down.
length = len(data)
length

12791

In [3]:
import random

# Relative frequency of each split: 675 : 225 : 100 (67.5% / 22.5% / 10%).
# NOTE: "testting" is spelled this way on purpose -- split columns already
# stored in the dataset (e.g. tvt2_2) use this spelling, so it is kept so that
# all split columns share the same three values.
split_weights = {"training": 675, "validation": 225, "testting": 100}
print(f"weights : {sum(split_weights.values())}")

# Seed a dedicated generator with the target column name.  Each split column
# therefore gets its own assignment, and re-running the notebook reproduces
# exactly the same one instead of silently drawing a new split every time.
rng = random.Random(where_to_save)

# One independent weighted draw per row, so the realised proportions only
# approximate the target weights.
tvt = rng.choices(
    list(split_weights),
    weights=list(split_weights.values()),
    k=length,
)

# Size of each split, absolute and as a share of all rows.
for title, split in (
    ("Training", "training"),
    ("Validation", "validation"),
    ("Testing", "testting"),
):
    count = tvt.count(split)
    share = round(count / length, 3) if length else 0.0  # guard an empty dataset
    print(f"{title} : {count} - {share}")

weights : 1000
Training : 8598 - 0.672
Validation : 2926 - 0.229
Testing : 1267 - 0.099


In [4]:
# Attach the assignment positionally.  A bare pd.Series(tvt) is aligned on index
# labels, which silently yields NaN for any row whose label is not in 0..n-1 or
# when the lengths differ; binding to data.index and checking the length makes
# such a mismatch fail loudly instead.
assert len(tvt) == len(data), "exactly one split label is required per row"
data[where_to_save] = pd.Series(tvt, index=data.index)
data.head()

Unnamed: 0,id,statement,label,tvt2,tvt2_1,tvt2_2,tvt2_3
0,2635.json,Says the Annies List political group supports ...,false,validation,validation,training,training
1,10540.json,When did the decline of coal start? It started...,half-true,training,training,validation,training
2,324.json,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,training,training,validation,validation
3,1123.json,Health care reform legislation is likely to ma...,false,training,validation,testting,training
4,9028.json,The economic turnaround started at the end of ...,half-true,training,training,testting,training


In [5]:
# Number of rows per distinct `label` value across the whole dataset.
label_count = data["label"].value_counts()
label_count

half-true      2627
false          2507
mostly-true    2454
barely-true    2103
true           2053
pants-fire     1047
Name: label, dtype: int64

In [6]:
# Distinct label values, in order of first appearance.
pd.unique(data["label"])

array(['false', 'half-true', 'mostly-true', 'true', 'barely-true',
       'pants-fire'], dtype=object)

In [7]:
# Tag every row with "<label>_<split>" and count how often each tag occurs.
combination = data.apply(
    lambda row: f"{row['label']}_{row[where_to_save]}",
    axis=1,
).value_counts()

# Regroup the counts as {split: {label: count}}.  Splitting the tag on "_"
# relies on neither the label nor the split name containing an underscore.
comparison = {}
for tag, n_rows in combination.items():
    label, cv_fold = tag.split("_")[:2]
    comparison.setdefault(cv_fold, {})[label] = n_rows

# Labels in order of first appearance; this fixes the report's column order.
labels = data["label"].unique().tolist()

def label_ratio(label_dict, labels):
    """Format each label's share of the total count as a tab-separated string.

    Args:
        label_dict: Mapping from label to count. Anything exposing ``.items()``
            and ``.get()`` works (a plain dict or a pandas Series).
        labels: Labels to report, in output order.

    Returns:
        One share per entry of ``labels``, rounded to 2 decimals and joined by
        tabs. A label missing from ``label_dict`` counts as 0 (a split may
        contain no sample of a rare label), and every share is 0.0 when the
        total count is 0.
    """
    # Iterate .items() rather than .values(): on a pandas Series `.values` is
    # an array attribute, not a method.
    total = sum(count for _, count in label_dict.items())

    shares = (
        round(label_dict.get(label, 0) / total, 2) if total else 0.0
        for label in labels
    )
    return "\t".join(f"{share}" for share in shares)

def label_raw_value(label_dict, labels):
    """Format the raw count of each label as a tab-separated string.

    Args:
        label_dict: Mapping from label to count. Anything exposing ``.get()``
            works (a plain dict or a pandas Series).
        labels: Labels to report, in output order.

    Returns:
        One count per entry of ``labels``, joined by tabs. A label missing from
        ``label_dict`` is reported as 0 (a split may contain no sample of a
        rare label).
    """
    return "\t".join(f"{label_dict.get(label, 0)}" for label in labels)

labels_str = ','.join(labels)

# The same table is printed twice: first as label shares, then as raw counts.
# "Original" is the label distribution over the whole dataset; the remaining
# rows are the individual splits.
for render in (label_ratio, label_raw_value):
    print(f"\nLabel,{labels_str}")
    print(f"Original\t{render(label_count, labels)}")
    for cv, comp in comparison.items():
        print(f"{cv.title()}\t{render(comp, labels)}")

# Size of each split, absolute and as a share of all rows.
print()
for title, split in (
    ("Training", "training"),
    ("Validation", "validation"),
    ("Testing", "testting"),
):
    n_split = tvt.count(split)
    print(f"{title} : {n_split} - {round(n_split/length, 3)}")


Label,false,half-true,mostly-true,true,barely-true,pants-fire
Original	0.2	0.21	0.19	0.16	0.16	0.08
Training	0.2	0.21	0.19	0.16	0.17	0.08
Validation	0.19	0.21	0.19	0.16	0.16	0.09
Testting	0.2	0.21	0.2	0.15	0.17	0.08

Label,false,half-true,mostly-true,true,barely-true,pants-fire
Original	2507	2627	2454	2053	2103	1047
Training	1697	1764	1632	1388	1429	688
Validation	556	603	570	479	464	254
Testting	254	260	252	186	210	105

Training : 8598 - 0.672
Validation : 2926 - 0.229
Testing : 1267 - 0.099


In [9]:
# Write the frame (now carrying the new split column) back to the same CSV path
# it was loaded from, overwriting the input file in place; the index is omitted.
data.to_csv("../../data/processed/liarpantsfire_dataset.csv", index=False)