<a href="https://colab.research.google.com/github/Tantatorn-dev/crypto_ml/blob/main/classical_cipher_dataset_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
Based on [this paper](https://scholarworks.sjsu.edu/cgi/viewcontent.cgi?article=1699&context=etd_projects), we'll create a dataset to reproduce the paper's results.

# Text Preprocessing
We only need capital letters, so I'll remove every non-letter character and capitalize the remaining Latin letters.

In [None]:
import re

def preprocess(text):
  """Reduce `text` to capital letters A-Z only.

  Every character that is not an ASCII letter is removed (digits, spaces,
  punctuation, ...), and what is left is upper-cased.
  """
  letters_only = re.sub(r"[^A-Za-z]+", '', text)
  return letters_only.upper()

In [None]:
# try this function: digits and symbols are dropped, the remaining letters are upper-cased
preprocess("dsfsdafe9293Djj22#324")

'DSFSDAFEDJJ'

# Classic Ciphers
There are 4 ciphers referred to in the paper. We'll implement those cipher algorithms to generate our dataset.

## Simple Substitution Cipher

In [None]:
import random

# create a key for our cipher 

def make_key():
  """Return a random substitution key.

  The key is the 26 capital letters A-Z in a random order, joined into one
  string. The order comes from the module-level `random` generator.
  """
  letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  random.shuffle(letters)
  return ''.join(letters)

# plaintext alphabet; the letters of a key line up with it position by position
alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# make_key() takes no arguments -- passing one is what raised the TypeError
key = make_key()
print(key)

TypeError: ignored

In [None]:
def substitution_encrypt(plaintext, key):
  """Encrypt `plaintext` with a simple substitution cipher.

  Args:
    plaintext: text made of the capital letters A-Z.
    key: the substitution alphabet; its i-th letter replaces the i-th letter
      of A-Z (the format make_key() produces).

  Returns:
    The ciphertext as a string.

  Raises:
    ValueError: if `plaintext` contains a character the key does not cover.
  """
  # kept local so the function does not depend on a notebook-level variable
  alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  keymap = dict(zip(alphabets, key))
  try:
    return ''.join(keymap[c] for c in plaintext)
  except KeyError as missing:
    # name the offending character instead of failing inside join()
    raise ValueError(f"cannot encrypt character {missing.args[0]!r}") from None

def substitution_decrypt(cipher, key):
  """Decrypt text produced by a simple substitution cipher.

  Args:
    cipher: ciphertext made of letters that occur in `key`.
    key: the substitution alphabet used for encryption; its i-th letter
      stands for the i-th letter of A-Z.

  Returns:
    The recovered plaintext as a string.

  Raises:
    ValueError: if `cipher` contains a character that is not in `key`.
  """
  # kept local so the function does not depend on a notebook-level variable
  alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  keymap = dict(zip(key, alphabets))
  try:
    return ''.join(keymap[c] for c in cipher)
  except KeyError as missing:
    # name the offending character instead of failing inside join()
    raise ValueError(f"cannot decrypt character {missing.args[0]!r}") from None

# round trip: encrypt a sample word, show it, then decrypt it again
cipher = substitution_encrypt("HELLO", key)
print(cipher)
roundtrip = substitution_decrypt(cipher, key)
print(roundtrip)

REZZL
HELLO


## Vigenère Cipher

In [None]:
# sample plaintext for the Vigenere demo (23 capital letters, no spaces)
text = "HELLOMOTOHELLOMOTOHELLO"

In [None]:
# create a cyclic key
def make_cyclic_key(text, key):
    """Repeat `key` until it is as long as `text`.

    Args:
        text: the message the key has to cover; only its length is used.
        key: the key as a string (or any iterable of one-character strings).

    Returns:
        The key as a string, repeated cyclically and cut to len(text)
        characters. A key that is already at least as long as `text` is
        returned in full, not shortened.

    Raises:
        ValueError: if `key` is empty while `text` is not.
    """
    key = "".join(key)
    if len(key) >= len(text):
        # always a str, including when the two lengths are equal
        return key
    if not key:
        raise ValueError("key must not be empty")
    repeats, extra = divmod(len(text), len(key))
    return key * repeats + key[:extra]

In [None]:
def vigenere_encrypt(plaintext, key):
  """Encrypt `plaintext` with the Vigenere cipher.

  `key` is first stretched with make_cyclic_key() so that there is one key
  letter per plaintext letter; each plaintext letter is then shifted by its
  key letter.
  """
  stretched = make_cyclic_key(plaintext, key)
  base = ord('A')
  # For capital letters the two 'A' offsets add up to 130, a multiple of 26,
  # so taking the raw code-point sum modulo 26 already gives the shifted
  # position in the alphabet.
  return "".join(
      chr((ord(letter) + ord(stretched[pos])) % 26 + base)
      for pos, letter in enumerate(plaintext)
  )

# demo: encrypt the sample text; the 5-letter key is repeated to cover the whole text
cipher = vigenere_encrypt(text, "MOTTT")
print(cipher)

TSEEHYCMHAQZEHFAHHAXXZH


## Column Transposition Cipher


In [None]:
import math

def split_len(seq, length):
    """Cut `seq` into consecutive pieces of `length` items.

    The pieces are returned as a list, in order; the last one is shorter when
    len(seq) is not a multiple of `length`.
    """
    pieces = []
    for start in range(0, len(seq), length):
        pieces.append(seq[start:start + length])
    return pieces

def column_transpostion_encrypt(plaintext, key):
    """Encrypt `plaintext` with a columnar transposition cipher.

    The text is laid out row by row in len(key) columns. The columns are then
    read top to bottom, starting with the column whose key value is smallest.
    A short last row is not padded.

    Args:
        plaintext: the text to encrypt.
        key: one value per column, each convertible with int() -- for example
            the string '3214' or a list of ints.

    Returns:
        The ciphertext as a string. It contains every plaintext character
        exactly once (an empty key yields '').
    """
    width = len(key)
    ranks = [int(val) for val in key]
    # sorted() is stable, so columns that share a key value are read left to
    # right instead of overwriting each other and dropping their letters.
    read_order = sorted(range(width), key=ranks.__getitem__)
    # plaintext[col::width] is column `col`: every width-th character from col
    return ''.join(ch for col in read_order for ch in plaintext[col::width])

# demo: four columns, read out in the order given by the key '3214'
print(column_transpostion_encrypt('IHAVETWOCATSANDTHREEDOGSANDFIVEPARROTS', '3214'))

AWTDEGDERHTANRONVRSIECAHDAIATVOSTESFPO


## Playfair Cipher

In [None]:
def matrix(key):
	"""Build the 5x5 Playfair key square for `key`.

	The square holds the 25 letters A-Z without J. The letters of `key`
	come first, in order of first appearance, followed by the rest of the
	alphabet.

	Args:
		key: the keyword; it is upper-cased, J is treated as I, and
			characters that are not letters are ignored.

	Returns:
		A list of 5 rows, each a list of 5 single-letter strings.
	"""
	alphabet="ABCDEFGHIKLMNOPQRSTUVWXYZ"

	# Only the 25 alphabet letters may enter the square: a J or a non-letter
	# from the key would otherwise push the last letters out of the 5x5 grid.
	letters=[]
	for e in key.upper().replace("J","I")+alphabet:
		if e in alphabet and e not in letters:
			letters.append(e)

	#Break it into 5*5
	return [letters[row:row+5] for row in range(0,25,5)]

def message_to_digraphs(message_original):
	"""Split a message into the two-letter groups that Playfair encrypts.

	Spaces are dropped. The letters are then paired from left to right;
	whenever a pair would hold the same letter twice, the first letter is
	paired with a filler 'X' instead and pairing resumes at the second
	letter. A single letter left at the end is also paired with 'X'.

	Args:
		message_original: the message, as a string or iterable of characters.

	Returns:
		A list of two-element lists, e.g. "HELLO" ->
		[['H','E'], ['L','X'], ['L','O']].
	"""
	letters=[e for e in message_original if e!=" "]

	# Walk the letters once and decide pair by pair. Deciding the number of
	# pairs up front misses pairs near the end, because every inserted filler
	# makes the message longer.
	digraphs=[]
	i=0
	while i<len(letters):
		first=letters[i]
		if i+1<len(letters) and letters[i+1]!=first:
			digraphs.append([first,letters[i+1]])
			i=i+2
		else:
			# doubled letter or lone last letter: pad with the filler.
			# NOTE: an 'X' paired with the filler still gives ['X','X'].
			digraphs.append([first,"X"])
			i=i+1
	return digraphs

def find_position(key_matrix,letter):
	"""Return the (row, column) of `letter` in the 5x5 `key_matrix`.

	Args:
		key_matrix: 5 rows of 5 letters each.
		letter: the letter to look up. 'J' is looked up as 'I' when the
			square has no cell for J.

	Returns:
		A (row, column) tuple, both in 0..4.

	Raises:
		ValueError: if the letter is not in the square. Returning a default
			such as (0, 0) would be indistinguishable from a real hit on the
			top-left cell.
	"""
	candidates=(letter,"I") if letter=="J" else (letter,)
	for wanted in candidates:
		for i in range(5):
			for j in range(5):
				if key_matrix[i][j]==wanted:
					return i,j
	raise ValueError("letter %r is not in the key matrix" % (letter,))

def playfair_encrypt(message, key):
	"""Encrypt `message` with the Playfair cipher.

	The message is split into letter pairs by message_to_digraphs() and each
	pair is replaced according to where its letters sit in the key square
	built by matrix(key):

	* same row: each letter becomes its right-hand neighbour (wrapping round)
	* same column: each letter becomes the one below it (wrapping round)
	* otherwise: each letter becomes the one in its own row and in the
	  column of the other letter

	Args:
		message: the plaintext, capital letters (a string or iterable of
			characters).
		key: the keyword for the key square.

	Returns:
		The ciphertext as a string, two letters per digraph.
	"""
	# The key square has no cell for J, so J is written as I. Doing this
	# before pairing also lets an "IJ" pair be split like any doubled letter.
	folded="".join("I" if e=="J" else e for e in message)

	key_matrix=matrix(key)
	cipher=[]
	for first,second in message_to_digraphs(folded):
		p1,q1=find_position(key_matrix,first)
		p2,q2=find_position(key_matrix,second)
		if p1==p2:
			cipher.append(key_matrix[p1][(q1+1)%5])
			cipher.append(key_matrix[p1][(q2+1)%5])
		elif q1==q2:
			cipher.append(key_matrix[(p1+1)%5][q1])
			cipher.append(key_matrix[(p2+1)%5][q2])
		else:
			cipher.append(key_matrix[p1][q2])
			cipher.append(key_matrix[p2][q1])
	return "".join(cipher)

# demo: encrypt a sample message with the key "BALCLA"
print(playfair_encrypt("HELLOMOTOTHATISNOTGOOD","BALCLA"))

IFGLCNNPYTYOCRGUOPSHNYPC


# Dataset Generator
Now that we have all the algorithms, we're going to generate our dataset. According to the paper, we need about 1000 texts in 4 categories. Texts will have a length of 10 to 1000.


In [None]:
# prepare the plaintexts: switch to the folder that holds the CSV files
# (a Google Drive path; assumes Drive is already mounted in this session -- TODO confirm)
%cd /content/drive/My Drive/Kaggle/cipher_text_challenge

/content/drive/My Drive/Kaggle/cipher_text_challenge


In [None]:
# list the folder contents to check that the data files are there
%ls

sample_submission.csv  test.csv  training.csv


In [None]:
import pandas as pd

# load the training file; the relative path is resolved against the current working directory
text_dataset = pd.read_csv("training.csv")
# preview the first rows
text_dataset.head()

Unnamed: 0,plaintext_id,text,index
0,ID_2ed3c75e8,Saying Bambi 2 is better than the original is ...,1228
1,ID_d51637e35,I detest slapstick and even as a child I could...,22903
2,ID_096b6847e,If you want to see a movie about things that w...,44844
3,ID_3f2297975,"The Flesh and the Fiends, also known as Mania,...",10556
4,ID_9431252bd,I am at a loss of words after watching this on...,18562


In [None]:
 # we need only text
# keep only the "text" column and preview it
texts = text_dataset["text"]
texts.head()

0    Saying Bambi 2 is better than the original is ...
1    I detest slapstick and even as a child I could...
2    If you want to see a movie about things that w...
3    The Flesh and the Fiends, also known as Mania,...
4    I am at a loss of words after watching this on...
Name: text, dtype: object

In [None]:
# number of texts available
texts.size

44682

In [None]:
# run every text through preprocess(): capital letters only
preprocessed = [preprocess(raw_text) for raw_text in texts]

# show the first ten results
print(preprocessed[:10])

['SAYINGBAMBIISBETTERTHANTHEORIGINALISACOMPLETEUNDERSTATEMENTTHEFILMISPERFECTONEVERYLEVELOFFILMMAKINGITSANINTOXICATINGFEELINGWHENAMOVIEEXCITESANDENLIVENSUSLIKETHISANDTHERESAPARTICULARGIDDINESSTOBEHADINTHINKINGABOUTWHATMOVIESCANBUTDONTOFTENDOFORONESSOULAFTERIMBIBINGSUCHAFINEVINTAGEHONESTLYITISAMAZINGAMUSTSEEFILMDISTURBINGBRILLIANTFANTASTICTHEFILMSSCREENPLAYISAMAZINGFANTASTICWRITINGUNPREDICTABLESERIOUSLYDIDNTKNOWWHATTOEXPECTNEXTTHEVOICEWORKICONSIDERTOBEOSCARWORTHYACTINGACOMPLETEMASTERPIECE', 'IDETESTSLAPSTICKANDEVENASACHILDICOULDNEVERUNDERSTANDWHYANAUDIENCELAUGHEDWHENPEOPLEGOTPOKEDINTHEEYETHEUNSPEAKABLETHREESTOOGESFELLDOWNTHEMAWKISHTIRESOMECHARLIECHAPLINORRANINTOANDDESTROYEDTHINGSTHEINEFFABLERITZBROTHERSTHISISTHEONLYMOVIEIHAVEEVERSEENINMYLIFEWHEREINOTONLYTHOUGHTTHESLAPSTICKWASHILARIOUSTRYINGTOIMPRESSTHEBLINDDATEBUTITMADEMELAUGHOUTLOUDSOMETHINGIDONTTHINKIVEDONEMORETHANTWOORTHREETIMESINWATCHINGAMOVIETHISISCOMICPERFECTIONFROMBEGINNINGTOENDANDNOTEVENTHEDISMALDISLIKABLEANNOYINGDIANEKEATONWHAT

## Let's encrypt our plaintext
We'll divide our texts equally among the classes.

Classes
1.   substitution (0)
2.   Vigenere (1)
3. column transposition (2)
4. playfair (3)



In [None]:
# number of texts per class: a quarter of all texts, rounded down
class_size = texts.size // 4
print(class_size)

11170


In [None]:
# shuffle our texts
# Seed the generator first so that a fresh run of the notebook always builds
# the same dataset. The make_key() calls further down use the same module-level
# generator, so their keys become repeatable too.
# NOTE: the shuffle is in place; re-running only this cell reshuffles the
# already shuffled list.
random.seed(42)
random.shuffle(preprocessed)

In [None]:
# divide into 4 classes: three slices of class_size texts each, and a last
# slice that takes everything that remains
class_0 = preprocessed[:class_size]
class_1 = preprocessed[class_size:class_size * 2]
class_2 = preprocessed[class_size * 2:class_size * 3]
class_3 = preprocessed[class_size * 3:]

# encrypt each class with its own cipher; substitution draws a new random key
# for every text, the other three ciphers use one fixed key
encrypted_class_0 = [substitution_encrypt(plain, make_key()) for plain in class_0]
encrypted_class_1 = [vigenere_encrypt(plain, "HELLO") for plain in class_1]
encrypted_class_2 = [column_transpostion_encrypt(plain, '3214') for plain in class_2]
encrypted_class_3 = [playfair_encrypt(plain, "BALCLA") for plain in class_3]

## Create a dataframe for our dataset

In [None]:
# one frame per class: plaintext, its ciphertext, and the numeric class label
COLUMNS = ["plaintext", "cipher", "class"]
df0, df1, df2, df3 = [
    pd.DataFrame(list(zip(plain, encrypted, [label] * len(plain))), columns=COLUMNS)
    for label, (plain, encrypted) in enumerate([
        (class_0, encrypted_class_0),
        (class_1, encrypted_class_1),
        (class_2, encrypted_class_2),
        (class_3, encrypted_class_3),
    ])
]

In [None]:
# assemble the frames of all classes into one
# ignore_index=True renumbers the rows 0..n-1; without it every class frame
# keeps its own 0-based labels and the combined index contains duplicates
df = pd.concat([df0, df1, df2, df3], ignore_index=True)
print(df)

                                               plaintext  ... class
0      IFYOUENJOYACTIONORIENTEDPERIODFILMSANDYOUARENT...  ...     0
1      FORMETHISISTHEBESTROLEOFHACKMANSCAREERANDONEOF...  ...     0
2      THISMOVIEWASPRETTYDARNAWFULBROOKESHIELDSACTING...  ...     0
3      GIRLSHYFINDSHAROLDLLOYDEXACTLYTHATAPOORKIDWHOW...  ...     0
4      INAYEARTHATGAVEUSFORRESTGUMPPULPFICTIONANDTHES...  ...     0
...                                                  ...  ...   ...
11167  WHENIHADFIRSTHEARDABOUTTHISMOVIEAROUNDABOUTLAS...  ...     3
11168  TWOQUESTIONSARISEWHENWATCHINGAFILMMIXINGTHESEG...  ...     3
11169  THISISNOTAMOVIEFOREVERYONEESPECIALLYWITHTHEPRO...  ...     3
11170  INTRIGINGLYCONVOLUTEDANDHAUNTINGLYREFRESHINGTH...  ...     3
11171  LOVEMEORLEAVEMEISNOTYOURTYPICALMUSICALTHATISNO...  ...     3

[44682 rows x 3 columns]


## Write to a CSV file

In [None]:
# index=False leaves the row index out of the file; only the data columns are written
df.to_csv('classic_cipher_dataset.csv', index=False)