In [None]:
import os
import tarfile
from tqdm import tqdm

# Define the dataset path on Google Drive
from google.colab import drive
# Mount Drive at /drive; the dataset paths defined below are resolved under this mount point.
drive.mount('/drive')

# Root folder of the IAM dataset on the mounted Drive.
# Adjust this path to the IAM dataset location in your Google Drive.
dataset_path = '/drive/MyDrive/Comp 542/IAM/'
words_tgz_path = os.path.join(dataset_path, "words.tgz")
words_folder_path = os.path.join(dataset_path, "words")

# Check if `words.tgz` exists
if os.path.exists(words_tgz_path):
    print(f"Found '{words_tgz_path}' on Google Drive.")

    # Check if the words folder exists (already extracted).
    # NOTE: only the folder's existence is tested, so a previously interrupted
    # extraction is skipped as well; delete the folder to force a fresh extraction.
    if not os.path.exists(words_folder_path):
        print(f"Extracting '{words_tgz_path}'...")

        # The "data" extraction filter rejects archive members that would be written
        # outside the target folder (absolute paths, ".." components, unsafe links).
        # It is passed only when this interpreter's tarfile module supports filters.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

        # Extract member by member so tqdm can report per-file progress.
        with tarfile.open(words_tgz_path, "r:gz") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting", unit="file"):
                tar.extract(member, path=words_folder_path, **extract_kwargs)
        print(f"Extraction complete. Files extracted to: {words_folder_path}")
    else:
        print(f"'{words_folder_path}' already exists. Skipping extraction.")
else:
    print(f"Error: '{words_tgz_path}' not found. Please ensure the dataset is in your Google Drive.")


Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
Found '/drive/MyDrive/Comp 542/IAM/words.tgz' on Google Drive.
'/drive/MyDrive/Comp 542/IAM/words' already exists. Skipping extraction.


In [None]:
dataset, vocab, max_len = [], set(), 0

# Preprocess the dataset by the specific IAM_Words dataset file structure.
# Every non-comment line of words.txt is split on single spaces; the fields used are:
#   [0]   word id such as "a01-000u-00-00" -> words/a01/a01-000u/a01-000u-00-00.png
#   [1]   status flag; entries marked "err" are skipped
#   [5]   kept (as a string) as the pixel width
#   [6]   kept (as a string) as the pixel height
#   [8:]  transcription pieces, stripped and concatenated into the label
words_txt_path = os.path.join(dataset_path, "ascii", "words.txt")
with open(words_txt_path, "r") as words_file:  # context manager closes the handle
    words = words_file.readlines()

for line in tqdm(words):
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    if len(line_split) < 7:
        # Blank or truncated line: the fields indexed below do not exist.
        continue
    if line_split[1] == "err":
        continue
    id_break_down = line_split[0].split("-")

    folder1 = id_break_down[0]
    folder2 = id_break_down[0] + '-' + id_break_down[1]
    file_name = line_split[0] + ".png"
    pixel_width = line_split[5]
    pixel_height = line_split[6]

    # Pieces are joined without a separator; strip() also drops the trailing newline.
    label = "".join(word.strip() for word in line_split[8:])

    rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name)
    if not os.path.exists(rel_path):
        continue

    dataset.append([rel_path, label, pixel_width, pixel_height])
    vocab.update(list(label))
    max_len = max(max_len, len(label))

# Show the first parsed entry as a sanity check (guarded so an empty result does not raise).
if dataset:
    print(dataset[0])
else:
    print("No dataset entries were loaded; check dataset_path and the extracted 'words' folder.")



100%|██████████| 115338/115338 [00:29<00:00, 3947.72it/s]

['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-00.png', 'A', '27', '51']





In [None]:
# Show a short sample instead of printing every entry, followed by the total count.
print(dataset[:5])
print(len(dataset))

95032


In [None]:
# Handling missing file paths or labels
# Input entries are [file_path, label, width, height]; output entries are [file_path, label].
# NOTE: `dataset` is rebound to the 2-field form at the end, so re-running this cell
# without first re-running the parsing cell fails on the 4-way unpack below.
cleaned_dataset = []
for file_path, label, width, height in dataset:
    if not file_path or not os.path.exists(file_path):
        print(f"Missing file: {file_path}, skipping entry.")
        continue

    # Width/height arrive as strings; a value that is not an integer is treated
    # like an invalid dimension instead of aborting the whole cell.
    try:
        has_valid_size = int(width) > 1 and int(height) > 1
    except ValueError:
        has_valid_size = False
    if not has_valid_size:
        print(f"Invalid dimensions for file: {file_path}, skipping entry.")
        continue

    if not label:
        print(f"Missing label for file: {file_path}, setting label as 'UNKNOWN'.")
        label = "UNKNOWN"
    cleaned_dataset.append([file_path, label])

# Replace the dataset with the cleaned version
dataset = cleaned_dataset
# Print to verify
print("Dataset after handling null values:")
print(dataset[:5])  # Print first 5 entries as a sample


In [None]:
# Drop entries whose label length lies more than two standard deviations from the mean
import numpy as np

label_lengths = [len(text) for _, text in dataset]
mean_length = np.mean(label_lengths)
std_length = np.std(label_lengths)

# A label is kept only when its length falls inside [lower_bound, upper_bound]
lower_bound = mean_length - 2 * std_length
upper_bound = mean_length + 2 * std_length

# Partition the entries: in-range ones are kept, the rest are reported and dropped
filtered_dataset = []
for file_path, label in dataset:
    if lower_bound <= len(label) <= upper_bound:
        filtered_dataset.append([file_path, label])
    else:
        print(f"Outlier detected for file: {file_path}, label: {label}")

dataset = filtered_dataset

# Print to verify
print("Dataset after removing outliers:")
print(dataset[:5])


Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-01-00.png, label: nominating
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-04-03.png, label: resolution
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-06-04.png, label: Manchester
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000x/a01-000x-00-07.png, label: nominating
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000x/a01-000x-03-05.png, label: resolution
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-000x/a01-000x-05-02.png, label: Manchester
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-003/a01-003-02-03.png, label: Foot-Griffiths
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-003/a01-003-02-04.png, label: resolution
Outlier detected for file: /drive/MyDrive/Comp 542/IAM/words/a01/a01-003/a01-003-04-02.png, label: Gover

In [None]:
# Attach a length category to every [file_path, label] entry
bins = [0, 5, 10, 20, 50]  # bin edges; with np.digitize's default, each edge is an inclusive lower bound
bin_labels = ["very short", "short", "medium", "long"]

# Bin all label lengths in one call; subtracting 1 turns digitize's result into an index into bin_labels
label_sizes = [len(label) for _, label in dataset]
bin_positions = np.digitize(label_sizes, bins) - 1

discretized_dataset = []
for (file_path, label), bin_index in zip(dataset, bin_positions):
    if bin_index < len(bin_labels):
        bin_category = bin_labels[bin_index]
    else:
        # Lengths at or beyond the last edge have no named bin
        bin_category = "very long"
    discretized_dataset.append([file_path, label, bin_category])

dataset = discretized_dataset

# Print to verify
print("Dataset after discretizing label lengths:")
print(dataset[:5])


Dataset after discretizing label lengths:
[['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-00.png', 'A', 'very short'], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-01.png', 'MOVE', 'very short'], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-02.png', 'to', 'very short'], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-03.png', 'stop', 'very short'], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-04.png', 'Mr.', 'very short']]


In [None]:
# Extract label lengths from the dataset (entries are [file_path, label, bin_category])
label_lengths = [len(label) for _, label, _ in dataset]
min_length = min(label_lengths)
max_length = max(label_lengths)

# Min-max normalization: (length - min) / (max - min).
# If every label has the same length the range is zero; map all values to 0.0
# rather than raising ZeroDivisionError.
length_range = max_length - min_length
normalized_lengths = [
    (length - min_length) / length_range if length_range else 0.0
    for length in label_lengths
]

print("Normalized label lengths:", normalized_lengths[:5])  # Print first 5 normalized lengths


Normalized label lengths: [0.0, 0.375, 0.125, 0.375, 0.25]


In [None]:
# Min-max normalization of label lengths
# Input entries are [file_path, label, bin_category]; a fourth field, the normalized
# label length, is appended to each entry.
label_lengths = [len(label) for _, label, _ in dataset]  # Get label lengths

min_length = min(label_lengths)  # Find minimum label length
max_length = max(label_lengths)  # Find maximum label length

# A zero range (all labels equally long) would make the division below fail,
# so in that case every normalized length is set to 0.0.
length_range = max_length - min_length

# Normalize label lengths and create new dataset
normalized_dataset = []
for file_path, label, bin_category in dataset:  # Iterate over dataset with 3 elements
    label_length = len(label)
    if length_range:
        normalized_length = (label_length - min_length) / length_range  # Apply Min-Max normalization
    else:
        normalized_length = 0.0
    normalized_dataset.append([file_path, label, bin_category, normalized_length])  # Add normalized length

dataset = normalized_dataset  # Update dataset

# Print to verify
print("Dataset after normalizing label lengths:")
print(dataset[:5])  # Print first 5 entries to verify normalization


Dataset after normalizing label lengths:
[['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-00.png', 'A', 'very short', 0.0], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-01.png', 'MOVE', 'very short', 0.375], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-02.png', 'to', 'very short', 0.125], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-03.png', 'stop', 'very short', 0.375], ['/drive/MyDrive/Comp 542/IAM/words/a01/a01-000u/a01-000u-00-04.png', 'Mr.', 'very short', 0.25]]


In [None]:
import pandas as pd

# Tabulate the entries: one row per sample, columns in the same order as the
# four fields of each `dataset` entry.
column_names = ["file_path", "label", "bin_category", "normalized_length"]
df = pd.DataFrame(data=dataset, columns=column_names)
print(df)

# Write the table to the current working directory, omitting the row index.
output_csv = "dataset.csv"
df.to_csv(output_csv, index=False)

print("Dataset saved as dataset.csv successfully!")


                                               file_path   label bin_category  \
0      /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u...       A   very short   
1      /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u...    MOVE   very short   
2      /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u...      to   very short   
3      /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u...    stop   very short   
4      /drive/MyDrive/Comp 542/IAM/words/a01/a01-000u...     Mr.   very short   
...                                                  ...     ...          ...   
90687  /drive/MyDrive/Comp 542/IAM/words/r06/r06-143/...     him   very short   
90688  /drive/MyDrive/Comp 542/IAM/words/r06/r06-143/...      in   very short   
90689  /drive/MyDrive/Comp 542/IAM/words/r06/r06-143/...     the   very short   
90690  /drive/MyDrive/Comp 542/IAM/words/r06/r06-143/...  garden        short   
90691  /drive/MyDrive/Comp 542/IAM/words/r06/r06-143/...       ?   very short   

       normalized_length  


In [None]:
from google.colab import files
# Download the CSV written by df.to_csv("dataset.csv"); the previous name "data.csv"
# does not exist and made this call raise FileNotFoundError.
files.download("dataset.csv")

FileNotFoundError: Cannot find file: data.csv

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Example: Evaluate discretized labels against label lengths
# Entries are [file_path, label, bin_category, normalized_length].
labels = [len(label) for _, label, _, _ in dataset]  # Use label lengths as the target
discretized_features = [bin_category for _, _, bin_category, _ in dataset]  # Discretized categories

# Encode string categories into numeric values
encoder = LabelEncoder()
encoded_features = encoder.fit_transform(discretized_features)  # Convert strings to integers

# Compute mutual information.
# The encoded bin ids are categorical codes, not measurements: with the default
# discrete_features='auto' a dense array is treated as continuous, so the feature
# is declared discrete explicitly. random_state keeps the score reproducible.
mi_scores = mutual_info_classif(
    encoded_features.reshape(-1, 1),
    labels,
    discrete_features=True,
    random_state=42,
)
print("Mutual Information Scores:", mi_scores)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Feature set 1: Use only normalized label lengths
# Entries are [file_path, label, bin_category, normalized_length].
normalized_lengths = [norm_length for _, _, _, norm_length in dataset]
X = np.array(normalized_lengths).reshape(-1, 1)  # Reshape for scikit-learn

# Target variable (label lengths), derived from `dataset` here so the cell does not
# depend on `labels` left over from the mutual-information cell.
y = [len(label) for _, label, _, _ in dataset]

# Train a simple classifier
clf = RandomForestClassifier(random_state=42)
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')  # Cross-validation
print("Accuracy with normalized lengths:", scores.mean())


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Collect both features and the target in a single pass over the
# [file_path, label, bin_category, normalized_length] entries
normalized_lengths, bin_categories, y = [], [], []
for _path, label, bin_category, norm_length in dataset:
    normalized_lengths.append(norm_length)
    bin_categories.append(bin_category)
    y.append(len(label))  # Target variable (label lengths)

# Turn the bin_category strings into integer codes
encoder = LabelEncoder()
encoded_categories = encoder.fit_transform(bin_categories)

# Two-column feature matrix: normalized length, encoded category
X = np.column_stack([normalized_lengths, encoded_categories])

# Fit a Random Forest on the full data
clf = RandomForestClassifier(random_state=42)
clf.fit(X, y)

# Report the importance the fitted forest assigns to each of the two columns
importances = clf.feature_importances_
print("Feature Importances:", importances)
