In [43]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [44]:
# Path of the .json file to load.
archivo = "libro.json"

In [45]:
class Comentario:
    """A single review: its text, its numeric score and a derived sentiment label.

    Attributes:
        text: The review text as passed in.
        score: The numeric rating as passed in.
        sentimiento: 'NEGATIVO' or 'POSITIVO', computed once from ``score``.
    """

    def __init__(self, text, score):
        """Store the review and classify it.

        Args:
            text: Review text.
            score: Numeric rating; it is compared against 2 to pick the label.
        """
        self.text = text
        self.score = score
        # The label is stored as a plain attribute. The classifier uses a
        # different (private) name so the attribute no longer overwrites a
        # method of the same name on the instance.
        self.sentimiento = self._clasificar_sentimiento()

    def _clasificar_sentimiento(self):
        """Map the score to a label: <= 2 is 'NEGATIVO', anything else 'POSITIVO'.

        A score of 3 is therefore counted as positive (there is no neutral class).
        """
        if self.score <= 2:
            return 'NEGATIVO'
        return 'POSITIVO'
        
class Container:
    """Holds a collection of comments and can rebalance it by sentiment."""

    def __init__(self, comentario):
        """
        Args:
            comentario: Sequence of objects exposing ``text`` and
                ``sentimiento`` attributes.
        """
        self.comentario = comentario

    def distribucion(self):
        """Balance the stored comments by sentiment and shuffle them.

        Both classes are truncated to the size of the smaller one, so the
        result always holds the same number of 'NEGATIVO' and 'POSITIVO'
        items (possibly zero of each). Items with any other label are dropped.
        ``self.comentario`` is rebound to a new list; the list passed to the
        constructor is not modified.
        """
        negativo = [x for x in self.comentario if x.sentimiento == 'NEGATIVO']
        positivo = [x for x in self.comentario if x.sentimiento == 'POSITIVO']

        # Truncating only the positives leaves the set unbalanced whenever
        # negatives are the majority, so cap both sides at the minority size.
        limite = min(len(negativo), len(positivo))
        self.comentario = negativo[:limite] + positivo[:limite]
        random.shuffle(self.comentario)

    def get_text(self):
        """Return the text of every stored comment, in the current order."""
        return [x.text for x in self.comentario]

    def get_sentiemiento(self):
        """Return the sentiment label of every stored comment, in the current order."""
        return [x.sentimiento for x in self.comentario]

    # Correctly spelled alias; the misspelled name stays because callers use it.
    get_sentimiento = get_sentiemiento

In [46]:
dato = []   # one Comentario per review line in the file

# The file is read line by line; each non-empty line is parsed as its own JSON
# document (json.load parses a whole file object, json.loads parses one string).
# The encoding is given explicitly so decoding does not depend on the platform default.
with open(archivo, encoding='utf-8') as file:
    for line in file:
        if not line.strip():       # skip blank lines instead of crashing in json.loads
            continue
        datos = json.loads(line)
        dato.append(Comentario(datos['reviewText'], datos['overall']))

      

In [47]:
# Spot-check one loaded review: its text, its score and the derived sentiment label.
dato[60].text, dato[60].score, dato[60].sentimiento


("This was a very quick to the point read. It had a lot of plot packed into it. It was pretty fast paced. Definitely not boring & had a bit of a twist. And as of now, I really, really don't like Lilly...I agree with the Krissy K. Even though they are short, the books are well worth .99. And so far the next book comes out pretty quickly. Sooo, I don't mind the stinking cliffhanger as much, lol!",
 4.0,
 'POSITIVO')

In [48]:
# Number of reviews loaded into `dato`.
len(dato)

10000

In [49]:
# Split the reviews into 70 % training / 30 % test data.
# Both sources of randomness are pinned so a fresh kernel reproduces the same
# numbers: `random_state` for the split, and the `random` module seed for the
# shuffle done inside Container.distribucion().
random.seed(42)
train_set, test_set = train_test_split(dato, test_size=0.30, random_state=42)

train_container = Container(train_set)
test_container = Container(test_set)

# Balance each subset by sentiment (and shuffle it) before extracting features.
train_container.distribucion()
test_container.distribucion()

In [50]:
# Inspect one element of the raw training split. `train_set` is the list returned by
# train_test_split; the balanced/shuffled data lives in train_container.comentario.
train_set[60].text, train_set[60].score, train_set[60].sentimiento

('In this book, James Rollins displays his skill at catering to carefree, undiscerning consumers. That is, people who think that if a movie or a book is not saturated with action, it is not worth their time. So, even when the story does not require an action sequence, Rollins inserts one. For discerning readers, the result is poisonous, agonizing pulp.Not only is the action excessive, much of it is implausible. The early boat chase on the Yangtze River? Not only can it be deleted without affecting the story, the end of the chase is silly. It is inappropriate for a serious novel, though fine for a comic book. Later on Rollins inserts a sequence reminiscent of John Wayne\'s prominent Rooster Cogburn sequence in TRUE GRIT. The difference is that Rooster Cogburn\'s is more realistic. And near the end of the book, we have a super-sandstorm that is miraculously cooperative. Its timing is too perfect. It seems to tell the characters, "Tell me when I should arrive, to make your story suspensef

In [51]:
# Pull texts (X) and sentiment labels (y) out of the balanced containers.
train_X, train_y = train_container.get_text(), train_container.get_sentiemiento()
test_X, test_y = test_container.get_text(), test_container.get_sentiemiento()

# Class counts, one per line: train POSITIVO, train NEGATIVO, test POSITIVO, test NEGATIVO.
print(
    *(etiquetas.count(clase)
      for etiquetas in (train_y, test_y)
      for clase in ('POSITIVO', 'NEGATIVO')),
    sep='\n',
)

457
457
187
187


In [52]:
# Show one training text together with its label.
train_X[10], train_y[10]

("Having been a family victim of a murder, I was fascinated in reading this novel that was written by a federal judge who has presided over a capital murder case.&#34;The Hanging Judge&#34; is the debut thriller by Michael Ponsor. It is excellent in many ways. It gives readers a view and perspective of courtroom drama in a capital murder case, along with what might and sometimes does happen behind the scenes.Personally, I am a lot like the main charater, David Norcross. I also have a lot of deep doubts about our legal system. Being a crime victim, I resent a clever defense attorney getting a criminal off the hook, when the villain is obviously very guilty.  However, I know that both the prosecutor and defense attorney are just doing the best job they can.Reading this novel was akin to watching a movie and not being able to do anything else until seeing what happens in the end. In a fiction movie or a fiction novel, writers can make their story have a good ending. Truth is, in reality, 

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # turn raw text into numeric sparse feature matrices

# TfidfVectorizer is used instead of CountVectorizer; the original note says it gives
# more weight to the words that are specific to a sentiment.
vectorizacion = TfidfVectorizer()

# Learn the vocabulary from the training texts only, then reuse that same fitted
# vectorizer for the test texts so both matrices share the same columns.
train_X_vect = vectorizacion.fit_transform(train_X)
test_X_vect = vectorizacion.transform(test_X)

In [54]:
# Display the vectorized test matrix (its repr shows shape and storage format).
test_X_vect

<374x9706 sparse matrix of type '<class 'numpy.float64'>'
	with 22892 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Classifier models compared in this notebook.
cls_svc = SVC()
# random_state is fixed because the tree permutes features randomly at each split;
# without it the fitted tree (and its score) can change from run to run.
cls_DTC = DecisionTreeClassifier(random_state=42)
# GaussianNB and LogisticRegression are imported above but not instantiated here.

In [56]:
# Train the decision tree on the vectorized training texts and their labels.
cls_DTC.fit(train_X_vect, train_y)

In [57]:
# Sanity check: predict the label of the first training sample (compare with train_y[0]).
cls_DTC.predict(train_X_vect[0])

array(['NEGATIVO'], dtype='<U8')

In [58]:
# Train the SVM on the vectorized training texts and their labels.
cls_svc.fit(train_X_vect, train_y)

In [59]:
# Sanity check: predict the label of the first training sample (compare with train_y[0]).
cls_svc.predict(train_X_vect[0])

array(['NEGATIVO'], dtype='<U8')

In [60]:
# The first training text and its true label, for comparison with the predictions above.
train_X[0],train_y[0]

("I couldn't get past the first chapter, as the way this story was written in first tense was hard to read. I'm sure once I got past the 4th or 5th chapter and finally got to the point of the story, it would have picked up a bit, but to read 15 pages and still not get anywhere, was frustrating.",
 'NEGATIVO')

In [61]:
# Mean accuracy of the SVM on the balanced test set.
cls_svc.score(test_X_vect, test_y)

0.839572192513369

In [62]:
# Mean accuracy of the decision tree on the balanced test set.
cls_DTC.score(test_X_vect, test_y)

0.6711229946524064

In [63]:
from sklearn.metrics import f1_score    # per-class F1 metric

# average=None returns one F1 value per class, in the order given by `labels`.
f1_score(test_y, cls_svc.predict(test_X_vect), average=None, labels=['NEGATIVO', 'POSITIVO'])

array([0.84293194, 0.83606557])

In [64]:
# Number of training samples in each class.
train_y.count('POSITIVO'), train_y.count('NEGATIVO')

(457, 457)

In [65]:
set_pruebas = ['I hate when good books end so quickly', 'very bad', 'very entertaining this book']   # hand-written phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)                                              # vectorize them with the already-fitted vectorizer

cls_svc.predict(set_transformado)

array(['POSITIVO', 'NEGATIVO', 'POSITIVO'], dtype='<U8')

In [66]:
### Tune the SVM with a grid search over kernel, C and gamma (5-fold cross-validation)
from sklearn.model_selection import GridSearchCV

# Every combination of these values is tried.
parametros = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': range(1, 10),              # integer C values 1..9
    'gamma': ('scale', 'auto')
}

cls_svc_gr = GridSearchCV(cls_svc, parametros, cv=5)
cls_svc_gr

In [67]:
# Run the grid search on the training data.
# NOTE(review): the vectorizer was fitted on the whole training set before this
# cross-validation, so each validation fold has already influenced the features.
cls_svc_gr.fit(train_X_vect, train_y)

In [68]:
# Parameter combination selected by the grid search.
cls_svc_gr.best_params_

{'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}

In [69]:
# Score of the tuned model on the test set (compare with the untuned cls_svc score).
cls_svc_gr.score(test_X_vect, test_y)

0.8422459893048129

In [70]:
import pickle                                   ## usando formado .pkl

In [71]:

# Serialize the fitted grid-search object to a .pkl file.
# NOTE(review): only the model is saved. The fitted `vectorizacion` is not persisted,
# yet new text must go through that same vectorizer before predict() — consider
# saving it alongside the model.
with open('cls_svc_v1.pkl', 'wb') as f:
    pickle.dump(cls_svc_gr, f)

In [72]:
# Read the model back from the .pkl file.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open('cls_svc_v1.pkl', 'rb') as f:
    cls_svc_final = pickle.load(f)

In [73]:
import joblib                                   ## usando formado .joblib

In [74]:
## Export the tuned model with joblib (the file is written as 'best_model', with no extension).
joblib.dump(cls_svc_gr, 'best_model')

['best_model']

In [75]:
## Load the tuned model back from the joblib file.
## SECURITY: joblib.load is pickle-based; only load files from a trusted source.
model = joblib.load('best_model')

In [76]:
set_pruebas = ['very entertaining this book, I recommend it', 'i realy hate this book', 'i hate how fast i finishing the book']   # hand-written phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)    

model.predict(set_transformado)   # model reloaded through joblib
## cls_svc_final.predict(set_transformado)  # same prediction with the pickle-loaded model

array(['POSITIVO', 'NEGATIVO', 'POSITIVO'], dtype='<U8')