In [1]:
# pip install -U scikit-learn
import matplotlib.pyplot as plt
import os
import numpy as np
import random
import cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [2]:
PATH = "/Users/preethi/MS DS/DeepLearning/Final Project/omniglot_combined"

# Get all alphabet folders
alphabet_folders = [os.path.join(PATH, folder)
                    for folder in os.listdir(PATH)
                        if os.path.isdir(os.path.join(PATH, folder))]

print(f"The dataset contains {len(alphabet_folders)} alphabets.")

# Get all character folders
character_folders = [os.path.join(alphabet_path, character)
                     for alphabet_path in alphabet_folders
                         for character in os.listdir(alphabet_path)
                             if os.path.isdir(os.path.join(alphabet_path, character))]

# Sort alphabetically 
character_folders = sorted(character_folders)

print(f"The dataset contains {len(character_folders)} characters.")

# Get all images
all_files = list()
for (dirpath, dirnames, filenames) in os.walk(PATH):
    all_files += [os.path.join(dirpath, file) for file in filenames if not file.startswith('.')]
print(f"The dataset contains {len(all_files)} images.")


The dataset contains 50 alphabets.
The dataset contains 1623 characters.
The dataset contains 32460 images.


In [3]:
def read_and_propocess_image(img_path, image_dim=None):
    """
    Takes an image path and returns a grayscale, resized, and flattened image as array
    
    Args:
    * img_path: Full path to file
    * image_dim: Resized image shape (width, height)
    
    Returns:
    a grayscale, resized, and flattened image
    """
    # Read image as grayscale
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    
    # Resize image
    if image_dim is not None:
        img = cv2.resize(img, image_dim, interpolation = cv2.INTER_AREA) 
    
    img = img.astype(np.float32) / 255.0
    img = 1.0 - img
    img = img.reshape(-1)
    return img

In [6]:
def split_train_test_files(path, train_path, test_path, train_ratio=0.8):
    """
    Takes a path to a folder and splits the files into train and test folders in 80:20 ratio.
    
    Args:
    * path: Full path to folder
    * train_path: Full path to train folder
    * test_path: Full path to test folder
    * train_ratio: Ratio of train to test files
    
    Returns:
    None
    """
    # Get all alphabet folders
    alphabet_folders = [os.path.join(path, folder)
                        for folder in os.listdir(path)
                            if os.path.isdir(os.path.join(path, folder))]

    # Get all character folders
    character_folders = [os.path.join(alphabet_path, character)
                         for alphabet_path in alphabet_folders
                             for character in os.listdir(alphabet_path)
                                 if os.path.isdir(os.path.join(alphabet_path, character))]

    # Sort alphabetically 
    character_folders = sorted(character_folders)
    
    # Create train and test folders
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)
    
    # Loop through each character folder
    for character_folder in character_folders:
        # Get alphabet name and character number
        alphabet_name = character_folder.split('/')[-2]
        character_number = character_folder.split('/')[-1]
        
        # Create alphabet folder in train and test folders
        train_alphabet_folder = os.path.join(train_path, alphabet_name)
        test_alphabet_folder = os.path.join(test_path, alphabet_name)
        if not os.path.exists(train_alphabet_folder):
            os.makedirs(train_alphabet_folder)
        if not os.path.exists(test_alphabet_folder):
            os.makedirs(test_alphabet_folder)
        
        # Create character folder in train and test folders
        train_character_folder = os.path.join(train_alphabet_folder, character_number)
        test_character_folder = os.path.join(test_alphabet_folder, character_number)
        if not os.path.exists(train_character_folder):
            os.makedirs(train_character_folder)
        if not os.path.exists(test_character_folder):
            os.makedirs(test_character_folder)
        
        # Get all image files
        files = os.listdir(character_folder)
        
        # Randomly shuffle files
        random.seed(42)
        random.shuffle(files)
        
        # Split files into train and test
        train_files = files[:int(len(files)*train_ratio)]
        test_files = files[int(len(files)*train_ratio):]
        
        # Copy each file into train and test folders
        for file in train_files:
            shutil.copy(os.path.join(character_folder, file), train_character_folder)
        for file in test_files:
            shutil.copy(os.path.join(character_folder, file), test_character_folder)


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define paths to train and test folders
train_folder = '/Users/preethi/MS DS/DeepLearning/Final Project/train_folder1'
test_folder = '/Users/preethi/MS DS/DeepLearning/Final Project/test_folder1'

# Define image size and batch size
img_height, img_width = 28, 28
batch_size = 32

# Use ImageDataGenerator to preprocess images
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)
test_datagen = ImageDataGenerator(rescale=1./255)

# Create data generators for train and test sets
train_generator = train_datagen.flow_from_directory(
    train_folder,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    color_mode='grayscale',
    class_mode='categorical')
test_generator = test_datagen.flow_from_directory(
    test_folder,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    color_mode='grayscale',
    class_mode='categorical')

# Define the CNN model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(img_height, img_width, 1)),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(train_generator.num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_generator, epochs=50)

# Evaluate the model on the test set
model.evaluate(test_generator)


In [None]:
test_loss, test_accuracy = model.evaluate(test_generator)
print("Test accuracy: {:.2f}%".format(test_accuracy * 100))

In [8]:
import pandas as pd
import os

train_folder = '/Users/preethi/MS DS/DeepLearning/Final Project/train_folder'
test_folder = '/Users/preethi/MS DS/DeepLearning/Final Project/test_folder'

train_data = []
test_data = []

# Loop over the alphabet folders
for alphabet_folder in os.listdir(train_folder):
    if not alphabet_folder.startswith('.'):
        # Loop over the character folders
        for character_folder in os.listdir(os.path.join(train_folder, alphabet_folder)):
            if not character_folder.startswith('.'):
                # Loop over the images
                for image_file in os.listdir(os.path.join(train_folder, alphabet_folder, character_folder)):
                    if not image_file.startswith('.'):
                        # Append the file path and label to the train data list
                        train_data.append([os.path.join(train_folder, alphabet_folder, character_folder, image_file), alphabet_folder, character_folder])

# Loop over the alphabet folders
for alphabet_folder in os.listdir(test_folder):
    if not alphabet_folder.startswith('.'):
        # Loop over the character folders
        for character_folder in os.listdir(os.path.join(test_folder, alphabet_folder)):
            if not character_folder.startswith('.'):
                # Loop over the images
                for image_file in os.listdir(os.path.join(test_folder, alphabet_folder, character_folder)):
                    if not image_file.startswith('.'):
                        # Append the file path and label to the test data list
                        test_data.append([os.path.join(test_folder, alphabet_folder, character_folder, image_file), alphabet_folder, character_folder])

# Create pandas dataframes for the train and test data
train_df = pd.DataFrame(train_data, columns=['filepath', 'language', 'character'])
test_df = pd.DataFrame(test_data, columns=['filepath', 'language', 'character'])

# Combine the language and character columns to form the label column
train_df['label'] = train_df['language'] + '_' + train_df['character']
test_df['label'] = test_df['language'] + '_' + test_df['character']

# Print the train and test dataframes
print('Train dataframe:')
print(train_df.head())
print('\nTest dataframe:')
print(test_df.head())

# create a csv file for train and test data
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)


Train dataframe:
                                            filepath  language    character   
0  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42  \
1  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   
2  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   
3  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   
4  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   

                  label  
0  Gujarati_character42  
1  Gujarati_character42  
2  Gujarati_character42  
3  Gujarati_character42  
4  Gujarati_character42  

Test dataframe:
                                            filepath  language    character   
0  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42  \
1  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   
2  /Users/preethi/MS DS/DeepLearning/Final Projec...  Gujarati  character42   
3  /Users/preethi/M

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten


In [5]:
# train_df = pd.DataFrame(train_data, columns=['filepath', 'label'])
# test_df = pd.DataFrame(test_data, columns=['filepath', 'label'])

# Preprocessing the data
train_data = []
test_data = []

for i in range(len(train_df)):
    img = cv2.imread(train_df['filepath'][i])
    img = cv2.resize(img, (64, 64))
    train_data.append([img, train_df['label'][i]])

for i in range(len(test_df)):
    img = cv2.imread(test_df['filepath'][i])
    img = cv2.resize(img, (64, 64))
    test_data.append([img, test_df['label'][i]])
# print(train_data[0:10])
# Converting the data to numpy arrays
train_data = np.array(train_data)
test_data = np.array(test_data)

# Splitting the data into X_train, y_train, X_test, y_test
X_train = np.array([i[0] for i in train_data])
y_train = np.array([i[1] for i in train_data])
X_test = np.array([i[0] for i in test_data])
y_test = np.array([i[1] for i in test_data])

# Normalizing the data
X_train = X_train / 255.0
X_test = X_test / 255.0


  train_data = np.array(train_data)
  test_data = np.array(test_data)


In [7]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical labels to numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)

# Define the CNN model
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Convert labels to one-hot encoding
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on train and test data
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)


Epoch 1/10


2023-04-21 17:11:28.902314: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train accuracy: 0.9250364899635315
Test accuracy: 0.5601491928100586


In [8]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical labels to numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])
num_classes = len(label_encoder.classes_)

# Define the CNN model
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model with sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on train and test data
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train accuracy: 0.9186967015266418
Test accuracy: 0.5640348196029663
