In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [2]:
# label: true = 0, fake = 1

In [3]:
experiment_name = "gemma_chat_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [4]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,1
1,[Former Vice President Dick Cheney on Sunday d...,1
2,[Former Vice President Dick Cheney on Sunday d...,0
3,[Space shuttle Discovery launched just before ...,1
4,[Space shuttle Discovery launched just before ...,0
...,...,...
14199,[The Federal Aviation Administration ordered U...,1
14200,[A Web designer in London was amazed to discov...,1
14201,[A Web designer in London was amazed to discov...,0
14202,[The mother of a 17-year-old girl who disappea...,1


In [5]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they would continue their fight for a pathway to citizenship for undocumented immigrants, despite recent setbacks in the legislative process. The Immigration Reform Coalition, the National Immigration Forum, the United Way for Immigration, and the American Civil Liberties Union (ACLU) made the statement as they joined forces to urge lawmakers to prioritize the issue of undocumented immigration reform. "The fight for immigrant rights is'

In [6]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [7]:
fake_train_dataset_df["text_sentences"][0]

['Four groups that advocate for immigrant rights said Thursday they would continue their fight for a pathway to citizenship for undocumented immigrants, despite recent setbacks in the legislative process',
 ' The Immigration Reform Coalition, the National Immigration Forum, the United Way for Immigration, and the American Civil Liberties Union (ACLU) made the statement as they joined forces to urge lawmakers to prioritize the issue of undocumented immigration reform',
 ' "The fight for immigrant rights is']

In [8]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 4.673937517590768
Average number of sentences in true texts: 5.242885319808397


In [9]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 5.513650436251056
Average number of 'the' in true texts: 5.121442659904198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [10]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [11]:
len(fake_train_dataset_full_text)

7116203

In [12]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)

In [13]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1165616,
 'e': 681185,
 'a': 495961,
 't': 457634,
 'i': 428984,
 'n': 419228,
 'o': 397668,
 's': 365089,
 'r': 360793,
 'h': 260497,
 'd': 230793,
 'l': 219325,
 'c': 181788,
 'u': 144796,
 'm': 126976,
 'f': 119782,
 'g': 118167,
 'p': 104800,
 'y': 91513,
 'w': 87421,
 'b': 70961,
 ',': 69396,
 'v': 57288,
 '.': 56223,
 'k': 39438,
 'T': 30983,
 'S': 20639,
 'A': 20602,
 '-': 18010,
 '"': 16882,
 "'": 16057,
 'C': 15502,
 'M': 13674,
 'I': 13075,
 '0': 12363,
 'P': 10365,
 'B': 10233,
 'x': 9932,
 'H': 9510,
 'N': 8695,
 'W': 8655,
 '1': 8412,
 'F': 8336,
 '2': 7968,
 'D': 7540,
 'j': 7137,
 'z': 6591,
 'R': 6398,
 'L': 6098,
 'J': 5890,
 'O': 5809,
 'G': 5483,
 'U': 5260,
 'q': 4608,
 'E': 4524,
 'K': 3824,
 '3': 3599,
 '5': 3481,
 '9': 2832,
 '4': 2604,
 'V': 2514,
 '*': 2438,
 '8': 2244,
 '7': 2233,
 '6': 2161,
 ':': 1826,
 'Y': 1696,
 '(': 1413,
 ')': 1381,
 '$': 902,
 'Z': 761,
 '?': 553,
 'Q': 543,
 '[': 527,
 ']': 515,
 ';': 246,
 '%': 238,
 'X': 191,
 '!': 177,
 '/': 

In [14]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
fake_char_counts_unicode

{32: 1165616,
 101: 681185,
 97: 495961,
 116: 457634,
 105: 428984,
 110: 419228,
 111: 397668,
 115: 365089,
 114: 360793,
 104: 260497,
 100: 230793,
 108: 219325,
 99: 181788,
 117: 144796,
 109: 126976,
 102: 119782,
 103: 118167,
 112: 104800,
 121: 91513,
 119: 87421,
 98: 70961,
 44: 69396,
 118: 57288,
 46: 56223,
 107: 39438,
 84: 30983,
 83: 20639,
 65: 20602,
 45: 18010,
 34: 16882,
 39: 16057,
 67: 15502,
 77: 13674,
 73: 13075,
 48: 12363,
 80: 10365,
 66: 10233,
 120: 9932,
 72: 9510,
 78: 8695,
 87: 8655,
 49: 8412,
 70: 8336,
 50: 7968,
 68: 7540,
 106: 7137,
 122: 6591,
 82: 6398,
 76: 6098,
 74: 5890,
 79: 5809,
 71: 5483,
 85: 5260,
 113: 4608,
 69: 4524,
 75: 3824,
 51: 3599,
 53: 3481,
 57: 2832,
 52: 2604,
 86: 2514,
 42: 2438,
 56: 2244,
 55: 2233,
 54: 2161,
 58: 1826,
 89: 1696,
 40: 1413,
 41: 1381,
 36: 902,
 90: 761,
 63: 553,
 81: 543,
 91: 527,
 93: 515,
 59: 246,
 37: 238,
 88: 191,
 33: 177,
 47: 154,
 38: 129,
 233: 81,
 43: 45,
 163: 36,
 8226: 31,
 2

In [15]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'é': 81,
 '£': 36,
 '•': 31,
 'ó': 21,
 'á': 21,
 'í': 16,
 '½': 15,
 '€': 15,
 'ñ': 14,
 'ã': 13,
 '°': 12,
 '–': 12,
 '»': 9,
 'ü': 8,
 '“': 8,
 'Á': 8,
 'ć': 8,
 '—': 6,
 'ö': 5,
 '”': 5,
 'è': 5,
 'ç': 5,
 'ä': 5,
 'ú': 4,
 '\xad': 3,
 'ø': 2,
 '’': 2,
 'ı': 2,
 'þ': 2,
 'ï': 2,
 'ğ': 2,
 'É': 2,
 '₹': 2,
 'š': 2,
 'ž': 2,
 'â': 1,
 'ô': 1,
 'à': 1,
 'Ş': 1,
 'ś': 1,
 'ë': 1,
 '®': 1,
 'û': 1,
 'ā': 1,
 '👑': 1,
 '💕': 1,
 'ń': 1,
 '¥': 1,
 'å': 1,
 'Ø': 1,
 'ă': 1}

In [18]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 6457
Number of special apostrophes: 2


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'