In [2]:
import spacy 
# Language-data modules; the STOP_WORDS sets of both are read further down.
import spacy.lang.es as es 
import spacy.lang.en as en
# Load the `es_core_news_sm` pipeline by name (the model package must be installed).
# `nlp` is only used by the practice cells that follow.
nlp = spacy.load('es_core_news_sm')

# Esta parte es solo de práctica

In [3]:
import numpy as np

In [4]:
# Materialise each language's STOP_WORDS collection as a list
# (Spanish first, then English).
stopwords_es, stopwords_en = (
    list(language_data.STOP_WORDS) for language_data in (es, en)
)

In [5]:
# Run the loaded pipeline over one sample sentence; the practice cells below
# iterate over the resulting `docx`.
docx = nlp(
    "Hola, mi amigo se llama David, "
    "el estaba corriendo detrás del estadio"
)

In [6]:
# Show every token next to its lemma.
for word in docx:
    # Two spaces after "Lemma" reproduce print()'s separator after the "Lemma " literal.
    print(f"{word.text} Lemma  {word.lemma_}")

Hola Lemma  Hola
, Lemma  ,
mi Lemma  mi
amigo Lemma  amigar
se Lemma  se
llama Lemma  llamar
David Lemma  David
, Lemma  ,
el Lemma  el
estaba Lemma  estar
corriendo Lemma  correr
detrás Lemma  detrás
del Lemma  del
estadio Lemma  estadio


In [7]:
# Print the lower-cased, stripped lemma of every token, skipping tokens whose
# lemma is the "-PRON-" placeholder.
for word in docx:
    if word.lemma_ == "-PRON-":
        continue
    print(word.lemma_.lower().strip())

hola
,
mi
amigar
se
llamar
david
,
el
estar
correr
detrás
del
estadio


In [8]:
# Same normalisation as a list: tokens whose lemma is "-PRON-" fall back to
# their lower-cased surface form.
[
    word.lower_ if word.lemma_ == "-PRON-" else word.lemma_.lower().strip()
    for word in docx
]

['hola',
 ',',
 'mi',
 'amigar',
 'se',
 'llamar',
 'david',
 ',',
 'el',
 'estar',
 'correr',
 'detrás',
 'del',
 'estadio']

In [9]:
# Print only the tokens that are neither stop words nor punctuation.
for word in docx:
    if word.is_stop or word.is_punct:
        continue
    print(word)

Hola
amigo
llama
David
corriendo
estadio


In [10]:
# List form of the same filter: drop stop words and punctuation.
[word for word in docx if not (word.is_stop or word.is_punct)]

[Hola, amigo, llama, David, corriendo, estadio]

# Parte Funcional

### Carga del dataset

In [11]:
import pickle
# SECURITY: pickle.load() can execute arbitrary code embedded in the file;
# only unpickle "dataSet.txt" if it comes from a trusted source.
with open("dataSet.txt", "rb") as fp:   # Unpickling
    # Later cells read column 0 as the text and column 1 as the label, so the
    # pickled object is presumably a sequence of (text, label) rows — TODO confirm.
    dataset = np.asarray(pickle.load(fp)) 
  

In [12]:
import string
# ASCII punctuation characters as a single string; `tokenizador` tests tokens
# against it with `in` (a substring test, since this is a str).
punctuations = string.punctuation

In [13]:
from spacy.lang.es import Spanish
from spacy.lang.en import English
# Spanish language object that `tokenizador` calls to split text into tokens
# and read their lemmas.
parser = Spanish()
# English alternative, currently disabled.
#parser = English()

In [14]:
def quitarSimbolos(listText):
    """Remove a fixed set of symbols and digits from every string in a list.

    The removed characters are brackets, common punctuation, arithmetic and
    comparison operators, a few other symbols and the digits 0-9 (see
    ``SYMBOLS`` below).

    Args:
        listText: Mutable list of ``str``. It is updated in place: each
            element is replaced by its symbol-free version, with whitespace
            left untouched. ``tokenizador`` calls this function for that side
            effect and ignores the return value, so the mutation is kept.

    Returns:
        A new list holding the same symbol-free strings, additionally stripped
        of leading and trailing whitespace.

    Raises:
        AttributeError: If an element is not a string.
    """
    # Backslash written as an explicit escape; the previous '\&' relied on an
    # invalid escape sequence, which newer Python versions warn about.
    SYMBOLS = '{}()[].,:;+-*/#%\\&|<>=~$1234567890'

    # Proper deletion table. Passing SYMBOLS itself to str.translate() made
    # it a lookup table indexed by code point, so characters with ordinals
    # 0-33 were replaced (' ' -> '9', '!' -> '0', '\n' -> '+') in the returned
    # list instead of any symbol being removed.
    deletion_table = str.maketrans('', '', SYMBOLS)

    for index, item in enumerate(listText):
        listText[index] = item.translate(deletion_table)

    return [item.strip() for item in listText]

# Maps accented Spanish vowels and "ñ", in both cases, to plain ASCII letters.
_TILDES = str.maketrans("áéíóúñÁÉÍÓÚÑ", "aeiounAEIOUN")


def quitarTildes(text):
    """Replace accented Spanish vowels and "ñ"/"Ñ" with their ASCII letters.

    Upper-case letters are handled as well as lower-case ones: ``tokenizador``
    calls this function before any lower-casing happens, so an upper-case
    accented letter would otherwise keep its accent.

    Args:
        text: String to normalise.

    Returns:
        A copy of ``text`` with á, é, í, ó, ú, ñ (and their upper-case forms)
        replaced by a, e, i, o, u, n (A, E, I, O, U, N). Every other character
        is left unchanged.
    """
    return text.translate(_TILDES)

def tokenizador(text):
    """Normalise a raw document into a space-separated string of lemmas.

    Steps, in order: remove accents with ``quitarTildes``, apply a fixed list
    of literal substitutions, tokenize with ``parser``, take each token's
    lower-cased lemma, remove symbols and digits with ``quitarSimbolos``, and
    drop Spanish stop words and punctuation.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = quitarTildes(text)  # Spanish only

    # Literal substitutions, applied one after another in this order.
    substitutions = (
        ("\r", " "),
        ("\n", " "),
        ("    ", " "),
        ('"', ''),
        ("''", ''),
        ("'s", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    lemmas = []
    for word in parser(text):
        if word.lemma_ == "-PRON-":
            # Placeholder lemma: fall back to the lower-cased surface form.
            lemmas.append(word.lower_)
        else:
            lemmas.append(word.lemma_.lower().strip())

    # Called for its in-place effect on `lemmas`; the returned list is unused.
    quitarSimbolos(lemmas)

    # `punctuations` is a str, so the second test is a substring check and
    # also discards tokens that ended up empty.
    kept = [
        lemma
        for lemma in lemmas
        if not (lemma in stopwords_es or lemma in punctuations)
    ]
    return " ".join(kept)

#tokenizador("Hi, my name is David, I like to play football")

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [16]:
# TF-IDF settings that are likely to be tuned.
ngram_range = (1, 2)   # unigrams and bigrams
min_df = 10            # integer document-frequency floor
max_df = 1.            # float document-frequency ceiling
max_features = 300     # vocabulary size cap

# Lower-casing and stop-word removal are disabled here; the documents are
# passed through `tokenizador` before they reach the vectorizer.
tfidf = TfidfVectorizer(
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    encoding='utf-8',
    stop_words=None,
    lowercase=False,
    norm='l2',
    sublinear_tf=True,
)

In [31]:
from sklearn.model_selection import train_test_split

# Column 0 of the dataset is the raw text and column 1 the label.
# Texts are normalised with `tokenizador`; labels are cast to int.
X = [tokenizador(raw_text) for raw_text in dataset[:, 0]]
Y = [int(raw_label) for raw_label in dataset[:, 1]]

# Hold out 15% of the samples for testing; the seed is fixed so the split is
# reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=8,
)

In [30]:
# Pickle the preprocessed texts and the integer labels to their own files.
for path, payload in (("dataX.txt", X), ("dataY.txt", Y)):
    with open(path, "wb") as fp:
        pickle.dump(payload, fp)

In [32]:
labels_train, labels_test = Ytrain, Ytest

# Fit the vectorizer on the training texts only, then reuse the fitted
# vocabulary for the test texts. Both matrices are converted to dense arrays.
features_train = tfidf.fit_transform(Xtrain).toarray()
print(features_train.shape)

features_test = tfidf.transform(Xtest).toarray()
print(features_test.shape)

(1887, 300)
(334, 300)


In [33]:
import pickle

# Persist the fitted vectorizer. The context manager closes (and therefore
# flushes) the file; the previous bare open() call never closed its handle.
with open("tfidf.pickle", "wb") as fp:
    pickle.dump(tfidf, fp)

In [20]:
from sklearn.feature_selection import chi2
import numpy as np

# Category name -> integer id. Presumably these ids match the integer labels
# stored in the dataset — TODO confirm against how dataSet.txt was built.
category_codes = {
    "sport" : 0,
    "entertainment": 1,
    "business" : 2,
    "politic": 3,
    "tech": 4
}
print(category_codes.items())

# Loop invariants, computed once instead of once per category.
labels_train_array = np.asarray(labels_train)
# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only provide get_feature_names(). Use whichever this installation has.
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
all_feature_names = np.array(_get_feature_names())

for Product, category_id in sorted(category_codes.items()):
    # One-vs-rest chi-squared test: 1 for samples of this category, 0 otherwise.
    features_chi2 = chi2(features_train, np.where(labels_train_array == category_id, 1, 0))
    # Ascending by chi2 statistic, so the most correlated terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Palabras mas correlacionadas:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Dos palabras mas correlacionadas:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

dict_items([('sport', 0), ('entertainment', 1), ('business', 2), ('politic', 3), ('tech', 4)])
# 'business' category:
  . Palabras mas correlacionadas:
. economia
. banco
. accionar
. crecimiento
. bn
  . Dos palabras mas correlacionadas:
. ano edad
. mil millón

# 'entertainment' category:
  . Palabras mas correlacionadas:
. cine
. estrellar
. premio
. actor
. pelicula
  . Dos palabras mas correlacionadas:
. reino unido
. mil millón

# 'politic' category:
  . Palabras mas correlacionadas:
. elección
. liberal
. ministro
. blair
. conservador
  . Dos palabras mas correlacionadas:
. ano edad
. reino unido

# 'sport' category:
  . Palabras mas correlacionadas:
. temporada
. equipar
. jugar
. jugador
. victoria
  . Dos palabras mas correlacionadas:
. reino unido
. ano edad

# 'tech' category:
  . Palabras mas correlacionadas:
. red
. ordenador
. software
. usuario
. tecnologia
  . Dos palabras mas correlacionadas:
. ano pasar
. ano edad



In [21]:
from sklearn.svm import SVC
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values.
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

# One sub-grid per kernel, so each kernel is combined only with the
# parameters listed for it.
param_grid = [
    dict(kernel=['linear'], C=C, probability=probability),
    dict(kernel=['poly'], C=C, degree=degree, probability=probability),
    dict(kernel=['rbf'], C=C, gamma=gamma, probability=probability),
]

# Base model whose parameters the search overrides.
svc = SVC(random_state=8)

# Explicit splitter so the CV splits can be seeded
# (GridSearchCV itself has no random_state argument).
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

# Run the search on the training features.
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed: 12.1min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='wa...one,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability': [True]}],
             pre_

In [22]:
# Report the winning parameter set and its score, with a blank line between.
print(
    "Mejores parametros para el modelo luego de la busqueda:",
    grid_search.best_params_,
    "",
    "Acc promedio del mejor modelo:",
    grid_search.best_score_,
    sep="\n",
)

Mejores parametros para el modelo luego de la busqueda:
{'C': 0.1, 'kernel': 'linear', 'probability': True}

Acc promedio del mejor modelo:
0.9406099518459069


In [23]:
# fit() returns the estimator itself, so `best_svc` is the same object as
# `grid_search.best_estimator_`.
# NOTE(review): GridSearchCV refits the best estimator on the full training
# data by default, so this extra fit is probably redundant — confirm.
best_svc = grid_search.best_estimator_.fit(features_train, labels_train)

# Predictions on the held-out test features.
svc_pred = best_svc.predict(features_test)

In [24]:


# Accuracy on the training set itself.
print(
    "Acc_train: ",
    accuracy_score(labels_train, best_svc.predict(features_train)),
    sep="\n",
)



Acc_train: 
0.9581346051934287


In [25]:
# Accuracy on the held-out test set, from the predictions computed earlier.
print("Acc_test ", accuracy_score(labels_test, svc_pred), sep="\n")

Acc_test 
0.9401197604790419


In [28]:
from joblib import dump, load
import os

# Save the tuned classifier only when no saved model exists yet. The next cell
# loads 'modeloSpanish.joblib', and with the dump commented out nothing in the
# notebook created that file, so a run on a fresh checkout failed there.
# An existing file is never overwritten.
if not os.path.exists('modeloSpanish.joblib'):
    dump(best_svc, 'modeloSpanish.joblib')

In [29]:
# Reload the saved classifier and score it on the same test features, as a
# check that the saved model reproduces the test accuracy.
# NOTE(review): joblib persistence is pickle-based as far as I know — only
# load model files from a trusted source.
clf = load('modeloSpanish.joblib')
print(
    accuracy_score(
        labels_test,
        clf.predict(features_test),
    )
)

0.9401197604790419
