In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import glob

## Utility Functions

In [7]:
def extract_form(form):
  """Return the first two dash-separated fields of ``form``, joined by '-'.

  For example ``'a01-000u-00'`` becomes ``'a01-000u'``.

  Raises:
    IndexError: if ``form`` contains no ``'-'`` at all.
  """
  # Only the first two fields are used, so stop splitting after two dashes.
  pieces = form.split('-', 2)
  return f"{pieces[0]}-{pieces[1]}"

## IAM Dataset Distribution

In [4]:
# Load the IAM writer metadata, leave out the writers whose native language
# is recorded as 'Swiss German', and tally the remaining writers per language.
iam_writer_info = (
  pd.read_csv('../Data/Writers-info.csv')
  .loc[lambda frame: frame['NativeLanguage'] != 'Swiss German']
)
iam_writer_info['NativeLanguage'].value_counts()

NativeLanguage
German    19
French    16
Name: count, dtype: int64

## Bristol-Corpus Distribution

In [None]:
# Read the Bristol-Corpus writer table and count its writers per native
# language.
bristol_writer_info = pd.read_csv(
  '../Data/Bristol-Corpus/English-GrayScale/english_df.csv'
)
bristol_writer_info.loc[:, 'NativeLanguage'].value_counts()


NativeLanguage
English    22
Name: count, dtype: int64

## Comparison Between the Two Datasets: Images per Writer

In [None]:
# Left-hand panel of the comparison figure: distribution of the number of
# feature-store rows per non-English writer.
# NOTE(review): this presumes one feature-store row per sentence-level image
# (that is what the x-axis label says) — confirm against the feature store.

# Writer metadata, restricted to the non-English writers.
writers_info = pd.read_csv('./Final-Data/new-writers-info.csv')
writers_info = writers_info.loc[writers_info['NativeLanguage'] != 'English']

# Feature table; only its 'name' and 'form' columns are used below.
feature_store = pd.read_csv('./Final-Data/All-Writers-Feature-Vectors-GrayScale.csv', index_col=[0])

# Attach each row's writer language. The inner join keeps only rows whose
# writer is present in the filtered metadata.
images_per_writer_per_class = pd.merge(
  feature_store[['name', 'form']],
  writers_info[['name', 'NativeLanguage']],
  how='inner',
  on='name',
)
images_per_writer_per_class['cnt'] = 1  # one unit per row, summed per writer below
images_per_writer_per_class = images_per_writer_per_class.rename(columns={'form': 'image'})
# Series.map rather than np.vectorize: np.vectorize raises ValueError on an
# empty input (e.g. when the join above matches nothing), whereas map simply
# returns an empty column.
images_per_writer_per_class['form'] = images_per_writer_per_class['image'].map(extract_form)

# Sum per writer; 'cnt' becomes the number of rows for that writer.
# NOTE(review): .sum() is applied to every remaining column, so the string
# columns 'image' and 'form' are aggregated too. Aggregating only 'cnt' would
# be cleaner, but the columns are kept in case later cells read them.
images_per_writer_per_class_group = images_per_writer_per_class.groupby(['name','NativeLanguage'], as_index=False).sum()
display(images_per_writer_per_class_group)

# Create the two-panel figure only once the data has loaded successfully, so a
# failed read does not leave an empty figure behind.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# IAM-DB Plot
sns.histplot(data=images_per_writer_per_class_group['cnt'], kde=True, ax=axes[0], bins=6, color="skyblue")
axes[0].set_title("IAM-DB (German & French Writers)")
axes[0].set_xlabel("Number of Sentence-Level Images per Writer")
axes[0].set_ylabel("Number of Writers")

# Right-hand panel of the comparison figure: distribution of the number of
# image files found on disk per Bristol-Corpus writer. Uses the `axes` created
# earlier in this cell.
english_writer_info = pd.read_csv('../Data/Bristol-Corpus/English-GrayScale/english_df.csv')

# Each 'images_path' cell holds the text of a Python literal that is iterated
# as a collection of glob patterns. Build exactly ONE count per writer by
# summing the matches over all of that writer's patterns. Appending once per
# pattern would make the list longer (or shorter, for an empty collection)
# than the table, so the column assignment below would fail or misalign.
num_img_list = []
for raw_patterns in english_writer_info['images_path']:
  patterns = ast.literal_eval(raw_patterns)
  # NOTE(review): replace() rewrites every './' occurrence, not only a leading
  # one; presumably the stored paths are relative to the parent directory of
  # this notebook — confirm against the CSV.
  num_img_list.append(
    sum(len(glob.glob(pattern.replace('./', '../'))) for pattern in patterns)
  )

english_writer_info['num_img'] = num_img_list

# Bristol Plot
sns.histplot(data=english_writer_info['num_img'], kde=True, ax=axes[1], bins=6, color="lightgreen")
axes[1].set_title("Bristol Corpus (English Writers)")
axes[1].set_xlabel("Number of Sentence-Level Images per Writer")
axes[1].set_ylabel("")

plt.tight_layout()