In [None]:
# Import required libraries
!python -m spacy download zh_core_web_sm #download chinese pipeline if need to
import re
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from collections import Counter

In [None]:
# ======== Task 1: Process the English Dataset ========
# Load the English corpus from CSV into a DataFrame. Later code in this cell
# reads the 'Text' column, so the file is expected to provide one.
file_path = "/data/English_Dataset.csv"
df = pd.read_csv(file_path, encoding="utf-8")
# Print the first rows as a quick sanity check of the loaded columns
print(df.head())

# Text Cleaning Using spaCy

# Load the spaCy English pipeline. clean_en_text() looks up this module-level
# `nlp` at call time rather than receiving it as an argument.
# NOTE(review): the setup cell only downloads zh_core_web_sm, so
# en_core_web_sm must already be installed — confirm, or add a download step.
# NOTE(review): Task 2 rebinds `nlp` to the Chinese pipeline, so this cell has
# to be re-run before clean_en_text() is used again in the same kernel.
nlp = spacy.load("en_core_web_sm")

# Define the cleaning function

def clean_en_text(text):
    """
    Tokenize one English document into a list of lowercase alphabetic words.

    Processing steps:
    - Missing values (None or float NaN) produce an empty list, so they do
      not contribute the literal word "nan" to the corpus
    - HTML tags are replaced by a space, so the words on either side of a
      tag are not glued into one token
    - The text is lowercased and tokenized with the module-level spaCy
      pipeline `nlp`
    - Only purely alphabetic tokens are kept, which drops punctuation,
      digits, symbols, mixed alphanumerics and whitespace tokens; tokens
      spaCy flags as number-like (e.g. "ten") are dropped as well

    Args:
        text: Raw value from the text column; a string, or None/NaN when
            the cell is empty. Other values are converted with str().

    Returns:
        list[str]: The retained tokens, lowercase, in document order.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Substitute a space (not an empty string) so "foo<br>bar" stays two words
    text = re.sub(r'<.*?>', ' ', str(text))
    doc = nlp(text.lower())
    return [token.text for token in doc if token.is_alpha and not token.like_num]

# Tokenize every document in the 'Text' column; each row becomes a list of tokens
df['cleaned_text'] = df['Text'].apply(clean_en_text)

# Corpus-level token statistics

# Concatenate the per-document token lists into one corpus-wide list
all_en_words = []
for doc_tokens in df['cleaned_text']:
    all_en_words.extend(doc_tokens)

# Corpus size in tokens (repeated words are counted every time they occur)
total_en_word_count = len(all_en_words)
print(f"Total number of English words: {total_en_word_count}")


# Map each distinct token to the number of times it occurs
en_word_counts = Counter(all_en_words)

# Word-length frequencies (one entry per individual length)

# en_length_freq maps {number of characters: total occurrences of all
# tokens with that many characters}
en_length_freq = {}
for token, occurrences in en_word_counts.items():
    n_chars = len(token)
    if n_chars in en_length_freq:
        en_length_freq[n_chars] += occurrences
    else:
        en_length_freq[n_chars] = occurrences

print("English Word Length Frequency Dictionary (Individual Lengths):\n")
# Report in ascending length order
for n_chars in sorted(en_length_freq):
    print(f"Length {n_chars}: {en_length_freq[n_chars]}")

# Line chart of total frequency against individual word length.
# x_values / y_values are kept at module level; the correlation step reuses them.
x_values = sorted(en_length_freq)
y_values = [en_length_freq[n_chars] for n_chars in x_values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_values, y_values, marker='o', linestyle='-', color='green', label='English')
# Write the exact count just above each data point
for n_chars, count in zip(x_values, y_values):
    ax.text(n_chars, count, f'{count}', ha='center', va='bottom', fontsize=10)
ax.set_xlabel("English Word Length (number of letters)")
ax.set_ylabel("English Total Frequency")
ax.set_title("Original English Word Length Frequency Distribution")
ax.grid(True)
ax.legend()
plt.show()

# Aggregate individual word lengths into coarser groups

# Inclusive (label, shortest, longest) ranges, three lengths per group.
# Lengths outside 1-18 match no range and are left out of the aggregate.
length_groups = [
    ("1-3", 1, 3),
    ("4-6", 4, 6),
    ("7-9", 7, 9),
    ("10-12", 10, 12),
    ("13-15", 13, 15),
    ("16-18", 16, 18),
]

group_bins = {label: 0 for label, _, _ in length_groups}
for n_chars, count in en_length_freq.items():
    for label, shortest, longest in length_groups:
        if shortest <= n_chars <= longest:
            group_bins[label] += count
            break  # ranges do not overlap, so the first match is the only one

print("Aggregated Grouped English Word Length Frequency:")
for label, total in group_bins.items():
    print(f"Group {label}: {total}")

# Line chart of the grouped English word-length frequencies
groups_order = ["1-3", "4-6", "7-9", "10-12", "13-15", "16-18"]
group_freqs = [group_bins[label] for label in groups_order]
# Groups are categorical, so plot them at evenly spaced integer positions
x_positions = range(len(groups_order))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_positions, group_freqs, marker='o', linestyle='-', color='blue', label='English')
ax.set_xticks(x_positions)
ax.set_xticklabels(groups_order)
# Write the exact count just above each data point
for position, total in zip(x_positions, group_freqs):
    ax.text(position, total, f'{total}', ha='center', va='bottom', fontsize=10)
ax.set_xlabel("English Word Length Group")
ax.set_ylabel("English Total Frequency")
ax.set_title("Aggregated English Word Length Frequency Distribution")
ax.grid(True)
ax.legend()
plt.show()

# Calculate Spearman's Correlation
# Spearman's rank correlation between word length (x_values) and the total
# frequency of words of that length (y_values).
# Only the `stats` module is imported from scipy, so the function must be
# reached through it; a bare `spearmanr` is undefined in this notebook.
rho, p_value = stats.spearmanr(x_values, y_values)

print(f"Spearman's Correlation Coefficient: {rho:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation. Significance is checked first so that every later branch
# is known to be significant; `not p_value <= 0.05` also catches a NaN
# p-value. The rho thresholds then partition the whole range:
#   rho < -0.8            strong negative
#   -0.8 <= rho < -0.5    moderate negative
#   -0.5 <= rho < -0.3    weak negative
#   -0.3 <= rho <= 0      very weak / none
#   rho > 0               positive (contradicts the law)
if not p_value <= 0.05:
    print("❓ No statistically significant correlation (p-value > 0.05). The results are inconclusive.")
elif rho < -0.8:
    print("✅ Strong negative correlation detected. Zipf’s Law of Abbreviation is supported!")
elif rho < -0.5:
    print("⚠️ Moderate negative correlation detected. Zipf’s Law of Abbreviation is partially supported.")
elif rho < -0.3:
    print("❓ Weak negative correlation detected. There is some evidence, but it is not strong.")
elif rho <= 0:
    print("❌ Very weak or no correlation. Zipf’s Law of Abbreviation is not supported.")
else:
    print("❌ Positive correlation detected. Zipf’s Law of Abbreviation is not supported.")

In [None]:
# ======== Task 2: Process the Chinese Dataset ========
# Rebind the shared `nlp` name to spaCy's Chinese pipeline
nlp = spacy.load("zh_core_web_sm")

# Read the corpus as a list of lines (each line keeps its trailing newline)
with open('CLTS.txt', 'r', encoding='utf-8') as corpus_file:
    cn_texts = corpus_file.readlines()

# Segment every line into words and collect them corpus-wide

# Matches any character outside the \u4e00-\u9fa5 range
# NOTE(review): matched characters (including punctuation) are deleted, not
# replaced by a separator, so neighbouring clauses are concatenated before
# segmentation — confirm this is intended.
non_cjk = re.compile(r'[^\u4e00-\u9fa5]')

all_words = []
for raw_line in cn_texts:
    cjk_only = non_cjk.sub('', raw_line.strip())

    # Segment with the spaCy pipeline and keep only real word tokens
    for token in nlp(cjk_only):
        # Skip punctuation, whitespace tokens and blank strings
        if token.is_punct or token.is_space or not token.text.strip():
            continue
        all_words.append(token.text)

# Number of word tokens observed for each word length (in characters)
cn_length_freq = Counter(len(word) for word in all_words)

# Order the (length, frequency) pairs by ascending word length
sorted_lengths = sorted(cn_length_freq.items())
B = [length for length, _ in sorted_lengths]  # word lengths
C = [freq for _, freq in sorted_lengths]      # matching frequencies

# Spearman's rank correlation between word length and frequency
rho, p_value = stats.spearmanr(B, C)

# Line chart of frequency against individual Chinese word length
fig, ax = plt.subplots(figsize=(10, 6))

# One tick per integer length from 0 up to the longest observed word
max_length = max(B)
ax.set_xticks(np.arange(0, max_length + 1, 1))

# Red dashed line with circle markers
ax.plot(B, C, 'ro--', alpha=0.8, linewidth=1, label='Chinese')

# Legend, title and axis labels
ax.legend()
ax.set_title('Word Length Frequency in Chinese Text')
ax.set_xlabel('Word Length')
ax.set_ylabel('Frequency')

# Write the exact count just above each data point
for n_chars, count in zip(B, C):
    ax.text(n_chars, count, str(count), ha='center', va='bottom', fontsize=8)

ax.grid(True, linestyle='--', alpha=0.3)
plt.show()

# Per-length counts, in ascending length order
print("\nWord Length Statistics:")
for n_chars, count in sorted_lengths:
    print(f"Length {n_chars}: {count}")

# Aggregate individual word lengths into coarser groups

# Inclusive (label, shortest, longest) ranges; the last group is wider than
# the others. Lengths outside 1-14 match no range and are left out.
length_groups = [
    ("1-2", 1, 2),
    ("3-4", 3, 4),
    ("5-6", 5, 6),
    ("7-8", 7, 8),
    ("9-10", 9, 10),
    ("11-14", 11, 14),
]

group_bins = {label: 0 for label, _, _ in length_groups}
for n_chars, count in cn_length_freq.items():
    for label, shortest, longest in length_groups:
        if shortest <= n_chars <= longest:
            group_bins[label] += count
            break  # ranges do not overlap, so the first match is the only one

print("Aggregated Grouped Chinese Word Length Frequency:")
for label, total in group_bins.items():
    print(f"Group {label}: {total}")

# Line chart of the grouped Chinese word-length frequencies
groups_order = ["1-2", "3-4", "5-6", "7-8", "9-10", "11-14"]
group_freqs = [group_bins[label] for label in groups_order]
# Groups are categorical, so plot them at evenly spaced integer positions
x_positions = range(len(groups_order))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_positions, group_freqs, marker='o', linestyle='-', color='blue', label='Chinese')
ax.set_xticks(x_positions)
ax.set_xticklabels(groups_order)
# Write the exact count just above each data point
for position, total in zip(x_positions, group_freqs):
    ax.text(position, total, f'{total}', ha='center', va='bottom', fontsize=10)
ax.set_xlabel("Chinese Word Length Group")
ax.set_ylabel("Total Frequency")
ax.set_title("Aggregated Grouped Chinese Word Length Frequency Distribution")
ax.grid(True)
ax.legend()
plt.show()


# Report the correlation computed on the per-length Chinese frequencies and
# interpret it: significance first, then the sign of rho.
print(f"\nSpearman's correlation: rho = {rho:.3f} (p = {p_value:.4f})")
if not p_value < 0.05:
    print("Results are not statistically significant")
elif rho < 0:
    print("Results support Zipf's Law of Abbreviation (significant negative correlation)")
else:
    print("Results do not support Zipf's Law of Abbreviation (significant positive correlation)")