# Memory and Unicode

### Binary - Integers

In [1]:
# Parse a string of base-2 digits into its integer (base-10) value
num_base_2 = "100"
num_base_10 = int(num_base_2, base=2)
print(num_base_10)

4


In [6]:
def binary_addition(a, b):
    """Add two base-2 digit strings and return the sum as a base-2 string.

    Parameters
    ----------
    a, b : str
        Integers written in base 2, e.g. "100". Anything accepted by
        ``int(value, 2)`` works (optional sign, optional "0b" prefix).

    Returns
    -------
    str
        The sum in base 2 without the "0b" prefix; a negative sum carries a
        leading "-".

    Raises
    ------
    ValueError
        If either argument is not a valid base-2 literal.
    """
    # format(..., "b") never emits the "0b" prefix. Slicing bin()'s output
    # with [2:] breaks for negative totals: bin(-3) is "-0b11", so the slice
    # would yield "b11".
    return format(int(a, 2) + int(b, 2), "b")

# Two operands written as binary strings
a = "100"  # 4 + 0 + 0 = 4
b = "111"  # 4 + 2 + 1 = 7

# Expected sum: "1011" = 8 + 0 + 2 + 1 = 11 = 4 + 7
c = binary_addition(a, b)
print(c)

1011


### Binary - ASCII Strings
* ASCII is a subset of Unicode

In [10]:
# Integer code of the ASCII character 'a'
ord('a')

97

In [14]:
# The same ASCII character's integer code written in binary
# ( bin() returns a string that includes a '0b' prefix to indicate binary )

bin(ord('a'))

'0b1100001'

### Binary - Unicode Strings / Code Points

In [19]:
# A single code point written with a \u escape (U+27F6, printed below as a long right arrow)
code_point = "\u27F6"
print(code_point)

⟶


In [20]:
# Integer value of the code point (hexadecimal 27F6 written in decimal)
ord(code_point)

10230

In [23]:
# The same code point in binary (it needs more than 8 bits, so it cannot fit in one byte)
bin(ord(code_point))

'0b10011111110110'

In [31]:
# Unicode escapes inside a string literal
# ( '\u' followed by exactly four hexadecimal digits names one Unicode code point )

s1 = "café"
s2 = "caf\u00e9"
# Both literals produce the same sequence of code points, so they compare equal
print(s1, s2, s1 == s2)

café café True


### Bytes ( A byte is 8 bits )

In [33]:
# Unicode string --- UTF-8 encoding ---> Sequence of bytes
# ( in the printed bytes, a '\x' prefix indicates the next two characters are one byte in hexadecimal )

s3 = "What␦"
# Encode the variable itself (as done for s4) so the text and its bytes cannot drift apart
s3_bytes = s3.encode("utf-8")
print("s3_bytes:", s3_bytes)

s4 = "Going\u27F6"
s4_bytes = s4.encode("utf-8")
print("s4_bytes:", s4_bytes)

s3_bytes: b'What\xe2\x90\xa6'
s4_bytes: b'Going\xe2\x9f\xb6'


### Decimal, Binary and Hexadecimal
* A byte is usually written as a two-digit hexadecimal number because two hexadecimal digits represent exactly 8 bits: $2^8 = 256$ values, going from 0 to 255 (`00` to `FF`)

|   Decimal   | Base 10 |     0    |     1    |     2    |     3    | ... |    14    |    15    |    16    | ... | 31       |    32    | 33       | ... |    254   |    255   |
|:-----------:|:-------:|:--------:|:--------:|:--------:|:--------:|:---:|:--------:|:--------:|:--------:|:---:|:--------:|:--------:|:--------:|:---:|:--------:|:--------:|
|    Binary   |  Base 2 | 00000000 | 00000001 | 00000010 | 00000011 | ... | 00001110 | 00001111 | 00010000 | ... | 00011111 | 00100000 | 00100001 | ... | 11111110 | 11111111 |
| Hexadecimal | Base 16 |     0    |     1    |     2    |     3    | ... |     E    |     F    |    10    | ... | 1F       |    20    | 21       | ... |    FE    |    FF    |

In [42]:
# Hexadecimal
# NOTE: inside a str literal, \xe2 is the character whose code point is hexadecimal E2 ('â'),
# not a raw byte; the bytes-literal form would be b"\xe2".
hex_byte = "\xe2" # 'â'
hex_byte

'â'

In [43]:
# Decimal
ord(hex_byte) # 226  ( hexadecimal E2 = 14*16 + 2 )

226

In [44]:
# Binary ( 226 needs all eight bits of one byte )
bin(ord(hex_byte))

'0b11100010'

In [46]:
# Creating a bytes type object ( a literal with a b prefix ) and checking its type
type(b'Bytes Object')

bytes

In [49]:
# Decoding a bytes type object: bytes --- UTF-8 decoding ---> str
decoded_bytes = b'Bytes Object'.decode('utf-8')
# The decoded result is text, not bytes
type(decoded_bytes)

str

# Example - CIA Sentences

In [62]:
import pandas as pd

# Load the statements table, decoding the file as UTF-8 text
sentences_cia = pd.read_csv("sentences_cia.csv", encoding="utf-8")
# Preview the first rows (at most 11)
sentences_cia.head(11)

Unnamed: 0,year,statement,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,1997,The FBI information included that al-Mairi's b...,,,
1,1997,The FBI information included that al-Mairi's b...,,,
2,1997,"For example, on October 12, 2004, another CIA ...",,,
3,1997,"On October 16, 2001, an email from a CTC offic...",,,
4,1997,"For example, on October 12, 2004, another CIA ...",,,
5,1997,"On October 16, 2001, an email from a CTC offic...",,,
6,1998,CIA Interrogators Disagree with CIA Headquarte...,,,
7,1998,"""^^''^ There was also CIA reporting in 1998 th...",,,
8,1998,"'^^^ Prior to Abu Zubaydah's capture, the CIA ...",,,
9,1998,• • • • 1759 (0213I9ZQCT04);HEADQUARTERSl|^Hm4...,,,


In [77]:
# Show the full, untruncated text of the statement at position 9
sentences_cia.iloc[9]["statement"]

'• • • • 1759 (0213I9ZQCT04);HEADQUARTERSl|^Hm40023ZNOV05);| (171225Z NOV 04); (140915Z NOV 04); (06I620Z DEC 04); 2207(1113I9Z APR 05)I^^^Hl2210a^7Z APR05)~|^|H2535 (051805Z JUL05); ^ ^ • • ^ 9 (120857ZJUL05)n^^^|2830 (29I304Z AUG 05); 1890 (171225Z NOV 040r^^B^^^^893^00831Z NOV 04); CIA document entitled, "Detainee Talking Points for ICRC Rebuttal, 12 April2007T||^BIB2210(141507Z APR 05); •^^••25^(051805Z JUL 05); 2210 (141507Z APR05)r^H|||[B225 (051805Z J U L 0 5 ) I ^ B | | 2830 (291304Z AUG 05); 1930 (061620Z DEC 2210 (141507Z APR 05) 2210(141507Z APR 05); 1691 (081609Z SEP04); 05); 2023 (151735Z JAN 05); (282019Z NOV 03) 1029 (291750Z JUN 06); 17I6(180742Z SEP 04); 2535 (051805ZJUL05); 1_716(180742Z SEP 04); 2515 (301946Z JUN 05); 1_142 (041358Z AUG 06); 3051 (301235Z SEP 05); 2830 (29i304Z AUG 1998 (020752Z JAN 1150 1543 (111600Z AUG04); 1029 (291750Z JUN 06) 1959 (111700Z DEC 04); 2038 (211558ZJAN05)^^^BPHIHi 1091 (031835Z NOV 03); 1266 (052309Z JAN 04); ^••••|^HT63?'

In [78]:
# The integer codes for all the characters we want to keep:
# digits 0-9, uppercase A-Z, lowercase a-z and the space character.
good_characters = [
    *range(ord("0"), ord("9") + 1),  # 48-57
    *range(ord("A"), ord("Z") + 1),  # 65-90
    *range(ord("a"), ord("z") + 1),  # 97-122
    ord(" "),                        # 32
]

def clean_statement(row):
    """Return the row's statement with every unwanted character removed.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key "statement" (a DataFrame row when
        used with ``DataFrame.apply(..., axis=1)``, or a plain dict).

    Returns
    -------
    str
        The statement restricted to characters whose integer code is in
        ``good_characters``, in their original order. An empty string is
        returned when the statement is not a string (e.g. a missing value).
    """
    statement = row["statement"]
    # A missing cell is read by pandas as NaN (a float), which cannot be
    # iterated character by character; treat any non-string as empty text.
    if not isinstance(statement, str):
        return ""
    # Set membership avoids scanning the whole list once per character
    allowed_codes = set(good_characters)
    return "".join(char for char in statement if ord(char) in allowed_codes)

# Run the cleaner once per row (axis=1) and store the result in a new column
sentences_cia["cleaned_statement"] = sentences_cia.apply(clean_statement, axis=1)

In [79]:
# Compare the raw and the cleaned text for the row at position 9
sentences_cia.iloc[9][["statement","cleaned_statement"]]

statement            • • • • 1759 (0213I9ZQCT04);HEADQUARTERSl|^Hm4...
cleaned_statement        1759 0213I9ZQCT04HEADQUARTERSlHm40023ZNOV0...
Name: 9, dtype: object

In [80]:
# Join all statements together into one string, then tokenize on single spaces
combined_statements = " ".join(sentences_cia["cleaned_statement"])
statement_tokens = combined_statements.split(" ")

In [81]:
# Keep only the tokens that are at least 5 characters long
statement_tokens_series = pd.Series(statement_tokens)
is_long_enough = statement_tokens_series.str.len() >= 5
statement_tokens_series = statement_tokens_series[is_long_enough]
filtered_tokens = statement_tokens_series.tolist()

In [82]:
# Token counts: how many times each filtered token occurs
pd.Series(filtered_tokens).value_counts()

interrogation                                    391
information                                      375
REDACTED                                         375
Zubaydah                                         328
Committee                                        327
techniques                                       322
August                                           307
September                                        299
provided                                         241
enhanced                                         241
Response                                         237
Ahmad                                            229
detainees                                        224
UNCLASSIFIED                                     221
reporting                                        209
Interrogation                                    207
December                                         189
Intelligence                                     188
alKuwaiti                                     

In [84]:
# Token counts: only the first three entries of the counts
pd.Series(filtered_tokens).value_counts()[:3]

interrogation    391
information      375
REDACTED         375
dtype: int64