In [32]:
import pandas as pd
import re
import string
import nltk
import os
import pickle
import pytesseract

from PIL import Image
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV

## Upload preprocessed dataset

In [2]:
input_df = pd.read_csv(os.path.join("..", "data", "train_data_complete_fixed.csv"))

## Cleaning data
All to lowercase, remove square brackets, links, punctuation and words with digits within.

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

input_df['text'] = input_df['text'].apply(lambda x: clean_text(x))

## Tokenization

In [4]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
input_df['text'] = input_df['text'].apply(lambda x: tokenizer.tokenize(x))

## Removal of stopwords
Applicable only to English files (no dedicated package for Polish)

In [5]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

input_df['text'] = input_df['text'].apply(lambda x: remove_stopwords(x))

## Combining lists to strings for vectorization

In [6]:
def combine_text(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

input_df['text'] = input_df['text'].apply(lambda x: combine_text(x))

## Training of TF-IDF Vectorizer

In [7]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))

tfidf_fitted = tfidf.fit(input_df['text'])

train_tfidf = tfidf_fitted.transform(input_df['text'])

## Saving trained vectorizer to pickle file

In [8]:
path_model = os.path.join("..", "models", "vectorizer.pkl")
with open(path_model, 'wb') as file:
    pickle.dump(tfidf_fitted, file)


## Saving vectorized training data to pickle file

In [17]:
path_data = os.path.join("..", "data", "vectorized_train_data.pkl")
with open(path_data, 'wb') as file:
    pickle.dump(tfidf_fitted, file)


## Test-train split for training data

In [20]:
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, input_df['target'], test_size=0.2, random_state=42)

## Model selection - Logistic Regression
Hyperparameters set by RandomizedSearchCV as shown below.

In [None]:
param_grid = {
    'C': [0.1, 1.0, 10.0],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga'],  # Optimization algorithm
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

model = LogisticRegression()
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
best_params = random_search.best_params_

print("Best model: ", best_model)
print("Best params: ", best_params)

In [23]:
model = LogisticRegression(C=10.0, max_iter=300, solver='liblinear', penalty='l2')
model.fit(X_train, y_train)

## Prediction on training data

In [27]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

                             advertisement       0.62      0.57      0.60       107
                                    budget       0.63      0.52      0.57       118
                                     email       0.90      0.76      0.82       116
                               file_folder       0.32      0.80      0.46        97
                               form_folder       0.67      0.53      0.60       105
                               handwritten       0.44      0.60      0.51       111
                                   invoice       0.74      0.69      0.71        99
                                    letter       0.65      0.59      0.62       120
                                      memo       0.64      0.59      0.62       110
                              news_article       0.57      0.53      0.55        89
                                  pit37_v1       0.99      0.96      0.97  

## Saving trained model to pickle file

In [29]:
path_for_model = os.path.join("..", "models", "log_reg_model.pkl")
with open(path_for_model, 'wb') as file:
    pickle.dump(model, file)

## Upload a scan file selected by user

In [31]:
file_path = input("Upload file to classify: ")

try:
    image = Image.open(file_path)
except Exception as e:
    print("Error while reading the file:", str(e))

## Reading file by OCR Tesseract

In [33]:
ocr_result = pytesseract.image_to_string(image, lang='eng+pol')

## Vectorization and prediction

In [44]:
vect_text = tfidf_fitted.transform([ocr_result])
prediction = model.predict(vect_text)

## Result and open predicted file for verification

In [45]:
print("Predicted class for this document: ", prediction)
image.show()

Predicted class for this document:  ['budget']


# Convolutional Neural Network: PyTorch

## Defining two-layered CNN

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class MyCNN(nn.Module):
    num_classes = 21

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, 5),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, 5),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))

        self.fc = nn.Linear(119072, self.num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)

        return out