# Statistical Analysis


In [1]:
from collections import Counter
import os

ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ,.-"
MOD = len(ALPHABET)


def read_ciphertext(filename):
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read().strip()
    return "".join(c for c in text if c in ALPHABET)


def ngrams(text, n):
    return [text[i:i+n] for i in range(len(text) - n + 1)]


def frequency_table(items):
    total = len(items)
    counter = Counter(items)
    return counter, total


def index_of_coincidence(text):
    freq = Counter(text)
    N = len(text)
    return sum(f * (f - 1) for f in freq.values()) / (N * (N - 1))


def format_table(counter, total, limit=15):
    lines = []
    for i, (k, v) in enumerate(counter.most_common()):
        if i >= limit:
            break
        perc = v / total * 100
        lines.append(f"{k:>4} : {v:>6} ({perc:5.2f}%)")
    return "\n".join(lines)


In [15]:
def analyze_cipher_to_file(cipher_id, base_path="177-Student"):
    input_file = os.path.join(base_path, f"{cipher_id}.txt")
    output_dir = os.path.join(base_path, "analysis_results")
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"analysis_{cipher_id}.txt")

    text = read_ciphertext(input_file)

    mono_counter, mono_total = frequency_table(text)
    di_counter, di_total = frequency_table(ngrams(text, 2))
    tri_counter, tri_total = frequency_table(ngrams(text, 3))

    ic = index_of_coincidence(text)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"Statistical Analysis – Cipher {cipher_id}\n")
        f.write("=" * 40 + "\n\n")

        f.write(f"Text length: {len(text)}\n")
        f.write(f"Index of Coincidence: {ic:.5f}\n\n")

        f.write("1-grams (characters)\n")
        f.write("-" * 25 + "\n")
        f.write(format_table(mono_counter, mono_total) + "\n\n")

        f.write("2-grams (digrams)\n")
        f.write("-" * 25 + "\n")
        f.write(format_table(di_counter, di_total) + "\n\n")

        f.write("3-grams (trigrams)\n")
        f.write("-" * 25 + "\n")
        f.write(format_table(tri_counter, tri_total) + "\n")

    print(f"✔ Analysis for cipher {cipher_id} saved to {output_file}")


In [16]:
for i in range(4):
    analyze_cipher_to_file(i)


✔ Analysis for cipher 0 saved to 177-Student\analysis_results\analysis_0.txt
✔ Analysis for cipher 1 saved to 177-Student\analysis_results\analysis_1.txt
✔ Analysis for cipher 2 saved to 177-Student\analysis_results\analysis_2.txt
✔ Analysis for cipher 3 saved to 177-Student\analysis_results\analysis_3.txt


The statistical analysis of all four ciphertexts was performed using a custom Python notebook.
For each ciphertext, frequency distributions of characters, digrams and trigrams, as well as the index of coincidence, were computed and saved to individual analysis files

# Classification

In [5]:
ENGLISH_FREQ = {
    'E': 11.93, 'T': 8.80, 'A': 7.90, 'O': 7.52, 'I': 6.82, 'H': 6.42,
    'N': 6.23, 'S': 5.99, 'R': 5.54, 'D': 4.36, 'L': 4.00,
    'U': 2.83, 'M': 2.64, 'W': 2.37, 'Y': 2.24, 'F': 2.11,
    'C': 2.10, 'G': 1.89, ',': 1.70, '.': 1.50, 'P': 1.47,
    'B': 1.37, 'V': 0.92, 'K': 0.75, '-': 0.21, 'X': 0.14,
    'Q': 0.11, 'J': 0.09, 'Z': 0.04
}


In [21]:
def chi_square_stat(text):
    N = len(text)
    freq = Counter(text)
    chi2 = 0
    for c in ALPHABET:
        observed = freq.get(c, 0)
        expected = ENGLISH_FREQ.get(c, 0) * N / 100
        if expected > 0:
            chi2 += (observed - expected) ** 2 / expected
    return chi2


In [22]:
def avg_ic_for_period(text, period):
    ics = []
    for i in range(period):
        stream = text[i::period]
        if len(stream) > 1:
            ics.append(index_of_coincidence(stream))
    return sum(ics) / len(ics)


In [23]:
def classify_cipher_robust(text):
    ic = index_of_coincidence(text)

    if ic > 0.055:
        # Cesare vs Sostituzione
        chi2 = chi_square_stat(text)
        if chi2 < 150:
            return "Caesar"
        else:
            return "Substitution"

    else:
        # Vigenere vs Hill
        ic5 = avg_ic_for_period(text, 5)
        if ic5 > 0.055:
            return "Vigenere"
        else:
            return "Hill"


In [24]:
results = {}
for i in range(4):
    text = read_ciphertext(f"177-Student/{i}.txt")
    results[i] = classify_cipher_robust(text)

for k, v in results.items():
    print(f"Cipher {k}: {v}")


Cipher 0: Substitution
Cipher 1: Substitution
Cipher 2: Hill
Cipher 3: Vigenere


The identification of the cipher types was partially automated by extracting statistical features such as the index of coincidence, frequency variance, and dominant n-gram distributions using a Python script.
Based on these features, each ciphertext was classified and subsequently decrypted using appropriate tools.
The automatic classification was used as a support tool; final decisions were validated manually through frequency analysis.
An automatic classification based on statistical features was initially used to support the identification of the cipher types.
Since Caesar ciphers are a special case of monoalphabetic substitution, the final classification was refined manually by comparing the frequency distributions with the reference English statistics.

# CESAR

In [2]:
def caesar_decrypt(text, shift):
    result = ""
    for c in text:
        idx = ALPHABET.index(c)
        result += ALPHABET[(idx - shift) % MOD]
    return result


def brute_force_caesar(filename):
    text = read_ciphertext(filename)
    for shift in range(MOD):
        candidate = caesar_decrypt(text, shift)
        print(f"\nSHIFT = {shift}")
        print(candidate[:400])


In [12]:
brute_force_caesar("177-Student/1.txt")



SHIFT = 0
J,XKIKTBXYYXVJEYJTA.DZTMTOTBBWXI.HXYEHVECCKD.VTJ.EDUOCTA.DZJ,X.HMEHWIIEKDWJ,.DTDWICTBBTDWQTYJXHMTBA.DZHEKDWJ,XWXVAJ,HXXEHYEKHJ.CXIQJ,XOVBKIJXHXWJEZXJ,XHQOTMD.DZWXXFBOQTDWBEEA.DZTJJ,XITCXIFEJEYWXXFZBEECEDJ,XUTDAIRCKHCKH.DZLXHOBEM.DJ,XH,OJ,C.VTBJEDXEYEDXEFFHXIIXWUOJ,XT.HQCHIRYBKI,.DZUXZTDJEMEDWXHM,XHXJ,XOMXHXJEIBXXFQYEHJ,XOVEKBWDEJIBXXFWEMDIJT.HIQJ,XOVEKBWDEJIBXXF.DTWEZ,EBXICXBB.DZEYE.BQJ,XOVEKBWDEJ

SHIFT = 1
IZWJHJSAWXXWUIDXIS-,CYSLSNSAAVWH,GWXDGUDBBJC,USI,DCTNBS-,CYIZW,GLDGVHHDJCVIZ,CSCVHBSAASCVPSXIWGLSA-,CYGDJCVIZWVWU-IZGWWDGXDJGI,BWHPIZWNUAJHIWGWVIDYWIZWGPNSLC,CYVWWEANPSCVADD-,CYSIIZWHSBWHEDIDXVWWEYADDBDCIZWTSC-HQBJGBJG,CYKWGNADL,CIZWGZNIZB,USAIDCWDXDCWDEEGWHHWVTNIZWS,GPBGHQXAJHZ,CYTWYSCIDLDCVWGLZWGWIZWNLWGWIDHAWWEPXDGIZWNUDJAVCDIHAWWEVDLCHIS,GHPIZWNUDJAVCDIHAWWE,CSVDYZDAWHBWAA,CYDXD,APIZWNUDJAVCDI

SHIFT = 2
HYVIGIR-VWWVTHCWHR.ZBXRKRMR--UVGZFVWCFTCAAIBZTRHZCBSMAR.ZBXHYVZFKCFUGGCIBUHYZBRBUGAR--RBUORWHVFKR-.ZBXFCIBUHYVUVT.HYFVVCFWCIFHZAVGOHYVMT-IGHVFVUHCXVHYVFOMRKBZBXUVVD-

THE USUAL EFFECT OF TAKING AWAY ALL DESIRE FOR COMMUNICATION BY MAKING THEIR WORDS SOUND THIN AND SMALL AND, AFTER WALKING ROUND THE DECK THREE OR FOUR TIMES, THEY CLUSTERED TOGETHER, YAWNING DEEPLY, AND LOOKING AT THE SAME SPOT OF DEEP GLOOM ON THE BANKS. MURMURING VERY LOW IN THE RHYTHMICAL TONE OF ONE OPPRESSED BY THE AIR, MRS. FLUSHING BEGAN TO WONDER WHERE THEY WERE TO SLEEP, FOR THEY COULD NOT SLEEP DOWNSTAIRS, THEY COULD NOT SLEEP IN A DOG-HOLE SMELLING OF OIL, THEY COULD NOT

In [3]:
def split_by_key_length(text, k):
    return [''.join(text[i::k]) for i in range(k)]


def average_ic_for_k(text, k):
    parts = split_by_key_length(text, k)
    return sum(index_of_coincidence(p) for p in parts) / k


def guess_key_length(text, max_k=20):
    results = []
    for k in range(1, max_k + 1):
        ic = average_ic_for_k(text, k)
        results.append((k, ic))
        print(f"k={k:2d}  IC={ic:.4f}")
    return results


In [4]:
k=5  IC=0.064
k=10 IC=0.063


SyntaxError: invalid syntax (609137164.py, line 1)

In [6]:
EN_FREQ = {
    'A': 0.08167, 'B': 0.01492, 'C': 0.02782, 'D': 0.04253,
    'E': 0.12702, 'F': 0.02228, 'G': 0.02015, 'H': 0.06094,
    'I': 0.06966, 'J': 0.00153, 'K': 0.00772, 'L': 0.04025,
    'M': 0.02406, 'N': 0.06749, 'O': 0.07507, 'P': 0.01929,
    'Q': 0.00095, 'R': 0.05987, 'S': 0.06327, 'T': 0.09056,
    'U': 0.02758, 'V': 0.00978, 'W': 0.02360, 'X': 0.00150,
    'Y': 0.01974, 'Z': 0.00074
}
