### Framework imports

In [1]:
from noronha.tools.publish import Publisher
from noronha.tools.shortcuts import data_path, tmp_path

### Application imports

In [2]:
import numpy as np 
import pandas as pd
import joblib
from io import StringIO
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC

### Training parameters

In [3]:
# Hyperparameters for this training run (collected into `params` and published with the model).
n_folds = 3        # number of stratified cross-validation splits
kernel = 'linear'  # kernel type handed to the SVC estimator
gamma = 0.1        # kernel coefficient handed to the SVC estimator

In [4]:
# Gather the hyperparameters in one mapping so they can be attached to the published model.
params = dict(gamma=gamma, kernel=kernel, n_folds=n_folds)

### Loading the dataset

In [None]:
# Read the intents dataset and rename its two columns to the names used by the cells below.
# NOTE(review): read_csv consumes the first row as a header by default — confirm the file has one.
dataset_file = 'datasets/atis_intents.csv'
df = pd.read_csv(dataset_file)
df.columns = ['label', 'querys']

### Data processing

In [None]:
# Mapping each intent label to a numerical id.
# Keep only the two columns of interest and drop rows whose query is missing. The explicit
# .copy() makes the result an independent frame, so adding a column right below does not
# raise pandas' SettingWithCopyWarning (the original assigned into a chained selection).
col = ['label', 'querys']
df = df.loc[df['querys'].notna(), col].copy()
df['category_id'] = df['label'].factorize()[0]

# Lookup tables for the mapping above, in both directions (label -> id and id -> label).
category_id_df = df[['label', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'label']].values)

### Vectorizing

In [None]:
# TF-IDF vectorizer over unigrams and bigrams of the queries, with English stop words removed.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    norm='l2',
    sublinear_tf=True,
)

# fit_transform returns a sparse matrix; toarray() turns it into a dense ndarray.
# NOTE(review): neither `features` nor `labels` is read by the later cells visible in this
# notebook — confirm they are still needed.
features = tfidf.fit_transform(df['querys']).toarray()
labels = df['category_id']

### Training split

In [None]:
# Split queries and labels into train and test parts (default split size, fixed seed).
# NOTE(review): X_test and y_test are not used by the later cells visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df['querys'],
    df['label'],
    random_state=0,
)

# Turn the training queries into token counts, then re-weight the counts with TF-IDF.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

### Creating the estimator and folding strategy

In [None]:
# Support-vector classifier configured from the training parameters.
clf = SVC(gamma=gamma, kernel=kernel)

# Shuffled, stratified folds; the RandomState is seeded with a fixed value (19).
fold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=np.random.RandomState(19),
)

### Training

In [None]:
# Cross-validate on the training split to obtain one score per fold.
score = cross_val_score(estimator=clf, X=X_train_tfidf, y=y_train, cv=fold, n_jobs=-1)

# Summary statistics of the per-fold scores.
# score_var previously held sqrt(variance) — i.e. the standard deviation — under a
# "variance" name. It now reports the variance itself, and the standard deviation is
# published separately as score_std so neither figure is mislabeled.
metrics = dict(
    score_avg=score.mean(),
    score_var=score.var(),
    score_std=score.std(),
)

# Actual training: fit the estimator on the whole training split.
clf.fit(X_train_tfidf, y_train)

### Model publish

In [None]:
# Serialize the fitted classifier to the location tmp_path resolves for 'clf.pkl'.
# NOTE(review): count_vect and tfidf_transformer are not saved here, yet new text has to go
# through them before clf can score it — confirm they are persisted elsewhere.
model_file = tmp_path('clf.pkl')
joblib.dump(clf, model_file)

# Publish, attaching the hyperparameters and cross-validation metrics as details.
# presumably Publisher picks the model file up from the tmp path — TODO confirm
publisher = Publisher()
publisher(details={'params': params, 'metrics': metrics})