In [7]:
# The European Classification of Individual Consumption according to Purpose (ECOICOP) data
# used in this example is open-source and provided by Statistics Poland.

# Link: https://github.com/UNECE/ML_dataset

In [8]:
import fasttext
import os
import pickle
from pathlib import Path

In [9]:
### Define file locations ###

# The training data and the model artefacts currently share one project folder,
# so both directory variables are bound to the same string. The trailing
# backslash matters: later cells build file paths by plain string concatenation.
data_dir = model_dir = "C:\\Users\\Justin Evans\\Documents\\Python\\UNECE\\Poland_FastText\\"

# Pickled arguments file; the training cell checks for it and, when present,
# reads the tuned hyperparameters from it.
args_file = Path(f"{model_dir}args.txt")


In [10]:
# File name under which the trained model is saved (appended to model_dir).
model_name = "model.bin"

# Fallback training settings, used only when the hyperparameter-tuning step
# was not run. Each one is passed to fasttext.train_supervised under the
# keyword noted beside it.
epochs, learning_rate = 20, 0.7            # epoch, lr
dimensions = 60                            # dim (original 500)
minimum_word_count = 6                     # minCount
word_ngrams = 6                            # wordNgrams
min_char_grams, max_char_grams = 4, 5      # minn, maxn

In [11]:
# Location of the fastText training file. It is resolved from data_dir (the
# directory declared for input data) rather than model_dir, so the two locations
# can be changed independently; they currently hold the same value.
# NOTE: os.path.join discards earlier components when a later one is absolute,
# so DATADIR only takes effect if data_dir is changed to a relative path.
train_data = os.path.join(os.getenv("DATADIR", ""), data_dir + "train.txt")
# test_data = os.path.join(os.getenv("DATADIR", ""), data_dir + "test_naics.txt")

In [12]:
# Train the supervised fastText model.
# If the hyperparameter-tuning arguments file exists, train with the tuned
# values stored in it; otherwise train with the default parameters and write a
# short text report of the values used.
if args_file.is_file():

    # Load the arguments file created during hyperparameter tuning. Open the
    # same path that was just checked (args_file); opening a bare "args.txt"
    # would resolve against the current working directory instead.
    # NOTE(review): pickle.load can execute arbitrary code - only load an
    # arguments file that you produced yourself.
    with open(args_file, "rb") as file:
        args = pickle.load(file)

    # Values are cast explicitly before being handed to fastText.
    model = fasttext.train_supervised(
        input=train_data,
        epoch=int(args['epochs']),
        lr=float(args['learning_rate']),
        dim=int(args['dimensions']),
        minCount=int(args['minimum_word_count']),
        wordNgrams=int(args['word_ngrams']),
        minn=int(args['min_char_grams']),
        maxn=int(args['max_char_grams']),
    )
    print("Created model based on hyperparameter results")

else:
    model = fasttext.train_supervised(
        input=train_data,
        epoch=epochs,
        lr=learning_rate,
        dim=dimensions,
        minCount=minimum_word_count,
        wordNgrams=word_ngrams,
        minn=min_char_grams,
        maxn=max_char_grams,
    )

    # Produce a report with the parameters the model was trained with.
    lines = ["Epochs: "+str(epochs)+"\n","Learning Rate: "+ str(learning_rate)+"\n",
            "Dimensions: "+str(dimensions)+"\n", "Minimum Word Count: "+str(minimum_word_count)+"\n",
             "Word NGrams: " +str(word_ngrams)+"\n",
             "Min Char grams: " +str(min_char_grams)+"\n", "Max Char Grams: " +str(max_char_grams)+"\n"]
    # The context manager closes the report even if writing fails.
    with open(model_dir+"Model_Parameters.txt","w") as report:
        report.writelines(lines)
    print("Created model based on default parameters")


Created model based on hyperparameter results


In [13]:
# Write the trained model to disk; the target path is the model directory
# string immediately followed by the model file name.
model.save_model(f"{model_dir}{model_name}")

In [14]:
# Status line shown once the preceding save cell has run.
saved_message = "model is saved!"
print(saved_message)

model is saved!
