## Data

In [1]:
from src.language_classifier import LanguageClassifier
from collections import Counter

# Build the classifier from the corpus under ./data.
# NOTE(review): the meaning of the positional arguments 0.1 and 0 is not
# visible in this notebook — confirm against LanguageClassifier's signature.
classifier = LanguageClassifier('./data', 0.1, 0)

# Summarise the training split: number of samples and samples per class.
train_samples = classifier.train_data_langs
print(f"Train set size: {len(train_samples)}")
counts = Counter(sample['class'] for sample in train_samples)
print(f"Train set classes: {counts}")

# Same summary for the held-out test split.
test_samples = classifier.test_data_langs
print(f"Test set size: {len(test_samples)}")
counts = Counter(sample['class'] for sample in test_samples)
print(f"Test set classes: {counts}")

Train set size: 24
Train set classes: Counter({'ANGIELSKI': 8, 'DUŃSKI': 8, 'NIEMIECKI': 8})
Test set size: 6
Test set classes: Counter({'ANGIELSKI': 2, 'DUŃSKI': 2, 'NIEMIECKI': 2})


## Training

In [2]:
# Stopping rule: run at most `max_epochs` epochs, and stop earlier as soon as
# the accuracy measured on the test results (in percent) reaches `min_accuracy`.
max_epochs = 100
min_accuracy = 95
epoch, accuracy = 1, 0

while accuracy < min_accuracy and epoch <= max_epochs:
    print(f"[[[ EPOCH {epoch} ]]]")

    print("Training...")
    classifier.learn_once()

    print("Testing...")
    test_result = classifier.test_once()

    # Log every test entry and tally how many predictions match their label.
    count_correct = 0
    for test in test_result:
        is_correct = (test['class'] == test['prediction'])
        verdict = 'CORRECT' if is_correct else 'INCORRECT'
        print(f"[{verdict}] Testing for {test['class']}; prediction is {test['prediction']}, in '{test['name']}'")
        if is_correct:
            count_correct += 1

    # Percentage of matching predictions in this epoch's test results.
    accuracy = 100 * count_correct / len(test_result)
    print(f"[ACCURACY]: {accuracy}\n")
    epoch += 1

[[[ EPOCH 1 ]]]
Training...
Testing...
[INCORRECT] Testing for ANGIELSKI; prediction is NIEMIECKI, in 'Stylist (magazine)'
[INCORRECT] Testing for DUŃSKI; prediction is NIEMIECKI, in 'Patent'
[CORRECT] Testing for NIEMIECKI; prediction is NIEMIECKI, in 'Margarita Nelken'
[INCORRECT] Testing for ANGIELSKI; prediction is NIEMIECKI, in 'John Horton Conway'
[INCORRECT] Testing for DUŃSKI; prediction is NIEMIECKI, in 'Bakterier'
[CORRECT] Testing for NIEMIECKI; prediction is NIEMIECKI, in 'Auszeichnung'
[ACCURACY]: 33.333333333333336

[[[ EPOCH 2 ]]]
Training...
Testing...
[CORRECT] Testing for ANGIELSKI; prediction is ANGIELSKI, in 'Stylist (magazine)'
[INCORRECT] Testing for DUŃSKI; prediction is NIEMIECKI, in 'Patent'
[CORRECT] Testing for NIEMIECKI; prediction is NIEMIECKI, in 'Margarita Nelken'
[CORRECT] Testing for ANGIELSKI; prediction is ANGIELSKI, in 'John Horton Conway'
[INCORRECT] Testing for DUŃSKI; prediction is NIEMIECKI, in 'Bakterier'
[CORRECT] Testing for NIEMIECKI; predict

## User input

In [3]:
import textwrap

# Read the text to classify and keep it exactly as typed. Wrapping is for
# display only: textwrap.fill inserts newlines and tab indents (and normalises
# whitespace), so the wrapped copy must not be what the classifier receives.
raw_text = input("Text to classify: ")

# `text` still ends up holding the wrapped, display-ready string, as before.
text = textwrap.fill(raw_text, width=100, subsequent_indent='\t')
print(f"[TEXT]: {text}\n")

# Classify the original input rather than the display-formatted copy.
print(f"[CLASSIFICATION]: {classifier.predict_class_of_text(raw_text)}\n")

[TEXT]: Perlin noise is a type of gradient noise developed by Ken Perlin in 1983. It has many uses,
	including but not limited to: procedurally generating terrain, applying pseudo-random changes to a
	variable, and assisting in the creation of image textures. It is most commonly implemented in two,
	three, or four dimensions, but can be defined for any number of dimensions. History  Ken Perlin
	developed Perlin noise in 1983 as a result of his frustration with the "machine-like" look of
	computer-generated imagery (CGI) at the time.[1] He formally described his findings in a SIGGRAPH
	paper in 1985 called "An Image Synthesizer".[2] He developed it after working on Disney's computer
	animated sci-fi motion picture Tron (1982) for the animation company Mathematical Applications
	Group (MAGI).[3] In 1997, Perlin was awarded an Academy Award for Technical Achievement for
	creating the algorithm, the citation for which read:[4][5][6][7]      To Ken Perlin for the
	development of Perlin Nois