In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [2]:
# label: true = 0, fake = 1

In [3]:
experiment_name = "gpt2_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [4]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,0
1,[Four groups that advocate for immigrant right...,1
2,[Former Vice President Dick Cheney on Sunday d...,1
3,[Former Vice President Dick Cheney on Sunday d...,0
4,[Space shuttle Discovery launched just before ...,0
...,...,...
15891,[South Africa pace bowler Dale Steyn ripped th...,1
15892,[South Africa pace bowler Dale Steyn ripped th...,0
15893,"[In a bustling room full of computers, giant w...",0
15894,"[In a bustling room full of computers, giant w...",1


In [5]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they will challenge Arizona\'s new immigration law, which allows police to ask anyone for proof of legal U.S. residency. The Mexican American Legal Defense and Educational Fund, the American Civil Liberties Union, the ACLU of Arizona and the National Immigration Law Center held a news conference Thursday in Phoenix to announce the legal challenge. "The Arizona community can be assured that a vigorous and sophisticated legal challenge wi'

In [6]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [7]:
fake_train_dataset_df["text_sentences"][0]

["Four groups that advocate for immigrant rights said Thursday they will challenge Arizona's new immigration law, which allows police to ask anyone for proof of legal U",
 'S',
 ' residency',
 ' The Mexican American Legal Defense and Educational Fund, the American Civil Liberties Union, the ACLU of Arizona and the National Immigration Law Center held a news conference Thursday in Phoenix to announce the legal challenge',
 ' "The Arizona community can be assured that a vigorous and sophisticated legal challenge wi']

In [8]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 3.2786514026921627
Average number of sentences in true texts: 5.230778910280609


In [9]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 3.2274499937099006
Average number of 'the' in true texts: 5.14080785201963


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [10]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [11]:
len(fake_train_dataset_full_text)

7963895

In [18]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)
true_char_counts = count_chars(" ".join([text for text in true_texts_df["text"].apply(lambda x: x[0])]))

In [13]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1338478,
 'e': 742644,
 'a': 544795,
 't': 504932,
 'i': 458463,
 'o': 457655,
 'n': 450094,
 'r': 410526,
 's': 399733,
 'h': 291556,
 'l': 248791,
 'd': 243313,
 'c': 172994,
 'u': 161707,
 'm': 145277,
 'g': 131199,
 'f': 128126,
 'w': 114671,
 'y': 114521,
 'p': 111566,
 'b': 88241,
 'v': 59831,
 '.': 51735,
 ',': 49898,
 'k': 48388,
 'T': 30333,
 'S': 28164,
 'A': 27058,
 "'": 24791,
 '"': 23964,
 '-': 23175,
 'C': 19343,
 'I': 18883,
 'M': 17731,
 'P': 17462,
 '0': 15067,
 'B': 13283,
 'N': 12431,
 '1': 12388,
 'H': 11603,
 'W': 11442,
 'F': 10835,
 'x': 10431,
 'R': 9495,
 'D': 9444,
 '2': 9359,
 'j': 8235,
 'J': 8091,
 'O': 7598,
 'L': 7545,
 '(': 7536,
 ')': 7425,
 'G': 7371,
 'U': 7164,
 ':': 6618,
 'z': 6469,
 'E': 6192,
 'K': 4895,
 'q': 4725,
 '3': 4281,
 '5': 4151,
 '4': 3577,
 '9': 3555,
 'V': 3045,
 '6': 2947,
 '7': 2913,
 '—': 2834,
 '8': 2832,
 'Y': 2782,
 '/': 2357,
 ';': 1481,
 '$': 1360,
 '?': 1214,
 '!': 1120,
 '–': 1102,
 'Z': 978,
 '[': 669,
 ']': 663,
 'Q

In [19]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
true_char_counts_unicode = {ord(k): v for k, v in true_char_counts.items()}
fake_char_counts_unicode

{32: 1338478,
 101: 742644,
 97: 544795,
 116: 504932,
 105: 458463,
 111: 457655,
 110: 450094,
 114: 410526,
 115: 399733,
 104: 291556,
 108: 248791,
 100: 243313,
 99: 172994,
 117: 161707,
 109: 145277,
 103: 131199,
 102: 128126,
 119: 114671,
 121: 114521,
 112: 111566,
 98: 88241,
 118: 59831,
 46: 51735,
 44: 49898,
 107: 48388,
 84: 30333,
 83: 28164,
 65: 27058,
 39: 24791,
 34: 23964,
 45: 23175,
 67: 19343,
 73: 18883,
 77: 17731,
 80: 17462,
 48: 15067,
 66: 13283,
 78: 12431,
 49: 12388,
 72: 11603,
 87: 11442,
 70: 10835,
 120: 10431,
 82: 9495,
 68: 9444,
 50: 9359,
 106: 8235,
 74: 8091,
 79: 7598,
 76: 7545,
 40: 7536,
 41: 7425,
 71: 7371,
 85: 7164,
 58: 6618,
 122: 6469,
 69: 6192,
 75: 4895,
 113: 4725,
 51: 4281,
 53: 4151,
 52: 3577,
 57: 3555,
 86: 3045,
 54: 2947,
 55: 2913,
 8212: 2834,
 56: 2832,
 89: 2782,
 47: 2357,
 59: 1481,
 36: 1360,
 63: 1214,
 33: 1120,
 8211: 1102,
 90: 978,
 91: 669,
 93: 663,
 81: 648,
 38: 494,
 8230: 484,
 124: 332,
 88: 328,
 

In [23]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'—': 2834,
 '–': 1102,
 '…': 484,
 '�': 90,
 '»': 90,
 '£': 83,
 'é': 70,
 '´': 68,
 '•': 57,
 '‑': 31,
 'á': 29,
 '©': 27,
 '›': 27,
 '€': 25,
 'ñ': 25,
 '‐': 22,
 'í': 20,
 'ó': 19,
 '·': 18,
 'ö': 17,
 '½': 16,
 'ü': 10,
 '×': 9,
 '\xad': 8,
 '\u200b': 6,
 '°': 6,
 '®': 6,
 '→': 5,
 '🙂': 5,
 'ã': 5,
 '′': 4,
 'Á': 4,
 '″': 4,
 'ë': 4,
 'è': 4,
 'ú': 4,
 'ä': 4,
 '―': 4,
 'â': 3,
 'Â': 3,
 'ğ': 3,
 '™': 3,
 'É': 3,
 'ô': 2,
 'Þ': 2,
 'ا': 2,
 'ل': 2,
 'ï': 2,
 'å': 2,
 'ç': 2,
 '¥': 2,
 'ø': 2,
 'ð': 1,
 '\ue606': 1,
 '货': 1,
 '源': 1,
 '\ue607': 1,
 'ه': 1,
 'ع': 1,
 'ن': 1,
 'د': 1,
 'ي': 1,
 'م': 1,
 'à': 1,
 '「': 1,
 '」': 1,
 'ı': 1,
 'ć': 1,
 '\uf04b': 1,
 '自': 1,
 '洋': 1,
 '\uf099': 1,
 '\uf101': 1,
 '🇨': 1,
 '🍾': 1,
 '날': 1,
 '자': 1,
 '과': 1,
 '陳': 1,
 '家': 1,
 '\ue800': 1,
 'ㅠ': 1,
 '😉': 1,
 'č': 1,
 'š': 1}

In [29]:
true_char_counts_special = {k: v for k, v in true_char_counts.items() if ord(k) > 128}

# sort by value true_char_counts_special
true_char_counts_special_sorted = dict(sorted(true_char_counts_special.items(), key=lambda item: item[1], reverse=True))
true_char_counts_special_sorted

{'•': 38,
 'é': 33,
 '£': 19,
 '½': 12,
 'ñ': 12,
 '»': 10,
 '€': 7,
 'í': 4,
 'ä': 4,
 'ã': 4,
 '\xad': 3,
 '°': 3,
 'è': 3,
 'ú': 3,
 'ö': 3,
 'á': 2,
 'ó': 2,
 'â': 1,
 'ô': 1,
 'à': 1,
 'ë': 1,
 '®': 1,
 'Á': 1,
 'ï': 1,
 'ü': 1,
 'É': 1,
 '¥': 1,
 'ø': 1,
 'å': 1}

In [32]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in true_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 10903
Number of special apostrophes: 0


In [31]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 13888
Number of special apostrophes: 0


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'