In [None]:
# Extract the training image archive (train.zip, stored under /content/drive/MyDrive)
# into the current working directory.
# NOTE(review): presumably this yields the "train" folder that is read via imgDir later - confirm.
!unzip /content/drive/MyDrive/INFO371/language-text-images/train.zip

In [None]:
# Extract the validation image archive (validation.zip, stored under /content/drive/MyDrive)
# into the current working directory.
# NOTE(review): presumably this yields the "validation" folder that is read via imgDir later - confirm.
!unzip /content/drive/MyDrive/INFO371/language-text-images/validation.zip

In [23]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
print("tensorflow version", tf.__version__)
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import sys

## Image properties
# Root folder; the "train" and "validation" subfolders are read from here.
imgDir = "/content"
# Every image is resized to this many pixels per side when loaded.
# Other sizes that were tried: 50 and 100.
# NOTE(review): Keras documents target_size/input_shape as (height, width);
# the (width, height) order used here is harmless only while the size is square.
targetWidth = 25
targetHeight = 25
imageSize = (targetWidth, targetHeight)
channels = 1  # number of color channels (images are loaded as grayscale)

## Other constants; these double as the command line argument defaults
epochs = 10   # number of training epochs
plot = False  # show example plots of wrong/correct predictions?

## command line arguments
# Decide whether this code runs as a stand-alone script or inside a notebook:
# only a script run gives __main__ a __file__ attribute.
import __main__ as main
if hasattr(main, "__file__"):
    # run as file: command line options override the defaults defined above
    print("parsing command line arguments")
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", "-d",
                        help = "directory to read images from",
                        default = imgDir)
    # type=int lets argparse reject a non-numeric value with a usage message
    # instead of failing later with a ValueError traceback
    parser.add_argument("--epochs", "-e",
                        type = int,
                        help = "how many epochs",
                        default = epochs)
    parser.add_argument("--plot", "-p",
                        action = "store_true",
                        help = "plot a few wrong/correct results")
    args = parser.parse_args()
    imgDir = args.dir
    epochs = args.epochs
    plot = args.plot
else:
    # run as notebook: imgDir, epochs and plot keep the values configured above.
    # (An earlier version assigned a home-directory path to a misspelled,
    # never-read variable here; that dead assignment has been removed so the
    # code no longer suggests that the image folder is changed in this branch.)
    print("run interactively from", os.getcwd())
print("Load images from", imgDir)
print("epochs:", epochs)

## Build the table of training files and their categories
trainDir = os.path.join(imgDir, "train")
filenames = os.listdir(trainDir)
print(len(filenames), "images found")
# The category is the 2-character code taken from positions -10..-8 of the
# file name, e.g. "dracula_EN-bdc.jpg" -> "EN".
categoryCodes = pd.Series(filenames).str[-10:-8]
trainingResults = pd.DataFrame({'filename': filenames, 'category': categoryCodes})

# Optional: restrict training to three languages
#trainingResults = trainingResults[(trainingResults.category == 'EN') | (trainingResults.category == 'ZN')|(trainingResults.category == 'TH')]

# Optional: train on a random sample instead of all files
# trainingResults = trainingResults.sample(n=10000, random_state = 1)

print("data files:")
print(trainingResults.sample(5))
# number of distinct categories found in the training file names
nCategories = trainingResults['category'].nunique()
print("categories:\n", trainingResults['category'].value_counts())
## Create model
## A small convolutional network: three Conv2D/BatchNorm/MaxPool/Dropout
## stages, one dense hidden layer, and a softmax output with one node per
## category found in the training data.
from tensorflow.keras import initializers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D,\
    MaxPooling2D, AveragePooling2D,\
    Dropout,Flatten,Dense,Activation,\
    BatchNormalization

model = Sequential()
## First convolutional layer with 32 filters (kernels)
model.add(Conv2D(32,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 input_shape=(targetWidth, targetHeight, channels)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.25))

## 2nd convolutional layer with 64 filters
model.add(Conv2D(64,
                 kernel_size = 3,
                 activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.25))

## 3rd convolutional layer with 128 filters
model.add(Conv2D(128,
                 kernel_size=3,
                 activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.25))

## Optional 4th convolutional layer. Only usable with a larger target size:
## with 25x25 input the feature map is already 1x1 after the third pooling
## step (25 -> 23 -> 11 -> 9 -> 4 -> 2 -> 1), too small for a 3x3 kernel.
# model.add(Conv2D(256,
#                  kernel_size=3,
#                  activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling2D(pool_size=2))
# model.add(Dropout(0.25))

## Flatten the feature maps into a single vector
model.add(Flatten())

## Use one final dense layer
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

## Output layer: one softmax node per category.
## This must be the ONLY softmax layer. The previous version put a hard-coded
## Dense(5, softmax) in front of it, which squeezed the features through a
## 5-value probability vector before the real output layer and hard-coded the
## number of classes.
model.add(Dense(nCategories,
                kernel_initializer = initializers.HeNormal(),
                activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.summary()

## Training data generator (pixel rescaling plus random augmentation)
# NOTE(review): horizontal_flip mirrors the text images - confirm this is intended.
trainAugmenter = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)
trainingGenerator = trainAugmenter.flow_from_dataframe(
    trainingResults,
    os.path.join(imgDir, "train"),
    x_col='filename',
    y_col='category',
    target_size=imageSize,
    color_mode="grayscale",
    class_mode='categorical',
    shuffle=True
)
# category name -> integer class index, as assigned by the generator;
# used later to turn predicted indices back into category names
label_map = trainingGenerator.class_indices

## Fit the model on the augmented training stream
history = model.fit(trainingGenerator, epochs=epochs)

## Build the table of validation files and a generator that reads them
validationDir = os.path.join(imgDir, "validation")
fNames = os.listdir(validationDir)
print(len(fNames), "validation images")
# same file name convention as for training: category code at positions -10..-8
validationCodes = pd.Series(fNames).str[-10:-8]
validationResults = pd.DataFrame({'filename': fNames, 'category': validationCodes})

# Optional: restrict validation to three languages
# validationResults = validationResults[(validationResults.category == 'EN') | (validationResults.category == 'ZN')|(validationResults.category == 'TH')]

print(validationResults.shape[0], "validation files read from", validationDir)
# Only rescale the pixels here - no augmentation, no labels (class_mode=None).
validationScaler = ImageDataGenerator(rescale=1./255)
validationGenerator = validationScaler.flow_from_dataframe(
    validationResults,
    validationDir,
    x_col='filename',
    class_mode=None,
    target_size=imageSize,
    color_mode="grayscale",
    # keep the file order: predictions are matched to the rows of
    # validationResults by position, so shuffling would misalign them
    shuffle=False
)

## Predict category probabilities for the validation images
print(" --- Predicting on validation data ---")
phat = model.predict(validationGenerator)
print("Predicted probability array shape:", phat.shape)
print("Example:\n", phat[:5])

## Turn probabilities into category names
# index of the most probable class for every image
predictedCodes = np.argmax(phat, axis=-1)
validationResults['predicted'] = pd.Series(predictedCodes, index=validationResults.index)
print(validationResults.head())
# invert the generator's mapping: integer class index -> category name
labelMap = {code: name for name, code in label_map.items()}
validationResults["predicted"] = validationResults["predicted"].replace(labelMap)
print("confusion matrix (validation)")
print(pd.crosstab(validationResults.category, validationResults.predicted))
isCorrect = validationResults.category == validationResults.predicted
print("Validation accuracy", np.mean(isCorrect))

## Show examples of misclassified (and, when plotting, correctly classified) images
isWrong = validationResults.predicted != validationResults.category
wrongResults = validationResults[isWrong]
# pick up to 4 misclassified rows for the plot
rows = np.random.choice(wrongResults.index, min(4, len(wrongResults)), replace=False)
print("Example wrong results (validation data)")
print(wrongResults.sample(min(10, len(wrongResults))))
if plot:
    plt.figure(figsize=(12, 12))
    correctResults = validationResults[~isWrong]
    # pick up to 4 correctly classified rows
    correctRows = np.random.choice(correctResults.index,
                                   min(4, len(correctResults)), replace=False)
    # 4x2 grid: wrong predictions start at panel 1, correct ones at panel 5
    panelGroups = ((wrongResults, rows, 1), (correctResults, correctRows, 5))
    for results, chosenRows, firstPanel in panelGroups:
        for index, row in enumerate(chosenRows, start=firstPanel):
            filename = results.loc[row, 'filename']
            predicted = results.loc[row, 'predicted']
            img = load_img(os.path.join(imgDir, "validation", filename), target_size=imageSize)
            plt.subplot(4, 2, index)
            plt.imshow(img)
            # label each panel with the file name and the predicted category
            plt.xlabel(filename + " ({})".format(predicted))
    plt.tight_layout()
    plt.show()

## Predictions on the training data:
## report the confusion matrix and accuracy on the images the model was fit on.
print(" --- Predicting on training data: ---")
# A separate generator over the training files: rescaling only, and no
# shuffling so that predictions line up with the rows of trainingResults.
trainScaler = ImageDataGenerator(rescale=1./255)
predictTrainGenerator = trainScaler.flow_from_dataframe(
    trainingResults,
    os.path.join(imgDir, "train"),
    x_col='filename',
    y_col='category',
    target_size=imageSize,
    color_mode="grayscale",
    class_mode='categorical',
    shuffle=False
)
phat = model.predict(predictTrainGenerator)
# most probable class index per image, then mapped back to the category name
trainCodes = np.argmax(phat, axis=-1)
trainingResults['predicted'] = pd.Series(trainCodes, index=trainingResults.index)
trainingResults["predicted"] = trainingResults["predicted"].replace(labelMap)
print("confusion matrix (training)")
print(pd.crosstab(trainingResults.category, trainingResults.predicted))
trainCorrect = trainingResults.category == trainingResults.predicted
print("Train accuracy", np.mean(trainCorrect))

tensorflow version 2.9.1
run interactively from C:\Users\pauls\Desktop\PS7
Load images from language
epochs: 10
31869 images found
data files:
                               filename category
9530                 dracula_EN-bdc.jpg       EN
17477            novel_00010_TH-abj.jpg       TH
22994            novel_00082_TH-aao.jpg       TH
29656  tolstoy-voina-i-mir-3_RU-arl.jpg       RU
28101  tolstoy-voina-i-mir-2_RU-bfh.jpg       RU
categories:
 EN    8442
TH    7425
RU    6511
ZN    5396
DA    4095
Name: category, dtype: int64
Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_51 (Conv2D)          (None, 23, 23, 32)        320       
                                                                 
 batch_normalization_68 (Bat  (None, 23, 23, 32)       128       
 chNormalization)                                                
                                               