In [112]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import joblib
import numpy as np


# 3. Carregar Dados:
df = pd.read_csv('/home/rafael/trabalho/detector_idioma/data_gold.csv', sep=';')

# 4. Dividir Dados:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# 5. Pré-processamento de Texto e Treinamento do Modelo:
pipeline = make_pipeline(CountVectorizer(analyzer ='char'), MultinomialNB())


# Definindo o espaço de parâmetros para o Grid Search
param_grid = {
    'countvectorizer__ngram_range': [(1, 1), (1, 2), (1,3), (2,3)],
    'multinomialnb__alpha': [0.1, 0.5, 1.0],
}

# Criando o objeto GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Executando a pesquisa em grade
grid_search.fit(df['text'], df['label'])

# Melhores parâmetros encontrados
best_params = grid_search.best_params_
print(f'Melhores parâmetros: {best_params}')

# 7. Avaliar Modelo:
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia no conjunto de teste: {accuracy}')

# 8. Salvamento do Modelo:
joblib.dump(grid_search, 'model.joblib')

Melhores parâmetros: {'countvectorizer__ngram_range': (2, 3), 'multinomialnb__alpha': 0.1}
Acurácia no conjunto de teste: 0.9814285714285714


['model.joblib']

In [8]:
from LanguageDetector.language_detector.language_detector import LanguageDetector


detector = LanguageDetector()

result = detector.detect_language('bom dia hello world capaz não é possivel')
print(result)


{'label': 'pt', 'score': 1.0}
