In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [2]:
archivo = 'libro.json'   # Path of the JSON input file that is loaded further down.

In [3]:
class Comentario:
    """One review: its text, its numeric score and the sentiment label derived from it.

    Attributes:
        text: the review text.
        score: the numeric rating given with the review.
        sentimiento: 'NEGATIVO' when ``score <= 2``, otherwise 'POSITIVO'.
    """

    def __init__(self, text, score):
        """Store the text and score and compute the sentiment label once."""
        self.text = text
        self.score = score
        # The label is stored as a plain string attribute. The classifier lives in a
        # separately named helper so that the attribute does not overwrite (and make
        # uncallable) a method of the same name on every instance.
        self.sentimiento = self._clasificar_sentimiento()

    def _clasificar_sentimiento(self):
        """Return the sentiment label for ``self.score``.

        Scores of 2 or less are 'NEGATIVO'; every other score (anything above 2,
        including 3) is 'POSITIVO'. A separate 'NEUTRAL' class for score == 3 is
        intentionally not produced.
        """
        return 'NEGATIVO' if self.score <= 2 else 'POSITIVO'
        
class Container:
    """Wraps a collection of comments and balances it by sentiment class.

    Each item is expected to expose ``.text`` and ``.sentimiento`` attributes.
    """

    def __init__(self, comentario):
        """Keep a reference to the given collection of comments."""
        self.comentario = comentario

    def distribucion(self):
        """Rebalance ``self.comentario`` to the same number of negatives and positives.

        Both classes are truncated to the size of the smaller one, so the result is
        balanced whichever class is the majority. Items whose label is neither
        'NEGATIVO' nor 'POSITIVO' are dropped. The balanced list is then shuffled
        in place with the global ``random`` module (seed it beforehand for
        reproducible order). The list originally passed to the constructor is not
        modified; ``self.comentario`` is rebound to a new list.
        """
        negativo = [x for x in self.comentario if x.sentimiento == 'NEGATIVO']
        positivo = [x for x in self.comentario if x.sentimiento == 'POSITIVO']

        # Truncate BOTH classes: trimming only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        por_clase = min(len(negativo), len(positivo))
        self.comentario = negativo[:por_clase] + positivo[:por_clase]
        random.shuffle(self.comentario)

    def get_text(self):
        """Return the text of every comment, in the current order."""
        return [x.text for x in self.comentario]

    def get_sentiemiento(self):
        """Return the sentiment label of every comment, in the current order.

        The misspelled name is kept because existing callers use it.
        """
        return [x.sentimiento for x in self.comentario]

    def get_sentimiento(self):
        """Correctly spelled alias of :meth:`get_sentiemiento`."""
        return self.get_sentiemiento()

In [4]:
dato = []

# The file holds one JSON document per line, so each line is parsed on its own
# with json.loads (json.load would try to parse the whole file as one document).
# The encoding is given explicitly so the result does not depend on the
# platform's default encoding.
with open(archivo, encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline); json.loads would raise on them.
            continue
        registro = json.loads(line)
        dato.append(Comentario(registro['reviewText'], registro['overall']))

      

In [5]:
# Spot-check one loaded comment: its text, its score and the derived sentiment label.
dato[60].text, dato[60].score, dato[60].sentimiento


("This was a very quick to the point read. It had a lot of plot packed into it. It was pretty fast paced. Definitely not boring & had a bit of a twist. And as of now, I really, really don't like Lilly...I agree with the Krissy K. Even though they are short, the books are well worth .99. And so far the next book comes out pretty quickly. Sooo, I don't mind the stinking cliffhanger as much, lol!",
 4.0,
 'POSITIVO')

In [6]:
# Number of comments loaded from the file.
len(dato)

10000

In [7]:
SEMILLA = 42   # fixed seed so the split and the shuffles below give the same result on every run

# Hold out 30% of the comments for testing.
train_set, test_set = train_test_split(dato, test_size=0.30, random_state=SEMILLA)

train_container = Container(train_set)
test_container = Container(test_set)

# distribucion() balances the classes and shuffles with the global `random`
# module, so that generator is seeded as well.
random.seed(SEMILLA)
train_container.distribucion()
test_container.distribucion()

In [8]:
# Spot-check one comment of the raw training split. Note that train_set is the
# list returned by train_test_split, not the balanced list held by train_container.
train_set[60].text, train_set[60].score, train_set[60].sentimiento

("A Dangerous Element: Gregory S. LambColonel Mark Coolhand Reynolds is behind closed doors in a psych ward being given meds that will hopefully erase his memory. Many other combat soldiers both officers and enlisted men are among the patients in this hospital being given these drugs. Entering the halls you see them carrying a special project notebook having to add their thoughts and notes before begin allowed to leave their daily meeting. As each person leaves he/she is told to stand in line and get their meds. But, Mark is smart, realizes what is happening and hides the meds within his cheeks and never swallows anything. The year is 2011 the month is December as the story flashes back several years to where it all began. Just what does the government want him and others to forget? Why are they going to such lengths to keep these people you might say in captivity? What will happen if he finally escapes? A Dangerous Element is the work of retired USAF Colonel Gregory S. Lamb and will b

In [9]:
# Texts (X) and sentiment labels (y) of each container, in matching order.
train_X, train_y = train_container.get_text(), train_container.get_sentiemiento()
test_X, test_y = test_container.get_text(), test_container.get_sentiemiento()

# Class counts, printed in this order:
# train POSITIVO, train NEGATIVO, test POSITIVO, test NEGATIVO.
for etiquetas in (train_y, test_y):
    for clase in ('POSITIVO', 'NEGATIVO'):
        print(etiquetas.count(clase))

459
459
185
185


In [10]:
# Look at one training text together with its label.
train_X[10], train_y[10]

('A heartwarming story about a desolate town &the preparation for Hollywood to come & make them famous. With the need for more women, an old relationship was rekindled & New ones started.  I enjoyed reading this book.',
 'POSITIVO')

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer ### vectorizers that turn raw text into sparse numeric feature matrices

vectorizacion = TfidfVectorizer() ## TF-IDF weights (floats, not 0/1): terms common to many documents are down-weighted; CountVectorizer() would give raw counts instead

# The vocabulary and IDF weights are learned from the training texts only ...
train_X_vect = vectorizacion.fit_transform(train_X)
# ... and the test texts are projected onto that same vocabulary.
test_X_vect = vectorizacion.transform(test_X)

In [12]:
# Show the repr of the vectorised test set (shape, dtype, number of stored elements).
test_X_vect

<370x9612 sparse matrix of type '<class 'numpy.float64'>'
	with 22250 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Candidate classifiers.
cls_svc = SVC()
# The tree builder permutes features at random, so without a fixed random_state
# the fitted tree (and its score) can change from run to run.
cls_DTC = DecisionTreeClassifier(random_state=42)
# Alternatives that are imported but currently disabled:
##cls_gaus = GaussianNB()
##cls_log = LogisticRegression()

In [14]:
# Train the decision tree on the vectorised training texts and their labels.
cls_DTC.fit(train_X_vect, train_y)

In [15]:
# Sanity check on the first training row. The model saw this row during fit,
# so this says nothing about generalisation.
cls_DTC.predict(train_X_vect[0])

array(['NEGATIVO'], dtype='<U8')

In [16]:
# Train the SVM on the vectorised training texts and their labels.
cls_svc.fit(train_X_vect, train_y)

In [17]:
# Sanity check on the first training row. The model saw this row during fit,
# so this says nothing about generalisation.
cls_svc.predict(train_X_vect[0])

array(['NEGATIVO'], dtype='<U8')

In [18]:
# Raw text and label of the first training row, i.e. the row behind train_X_vect[0].
train_X[0],train_y[0]

("If you like James Patterson's books, I say read this one, especially If you have read NYPD.  Personally, I read a lot of his books.  This is one of my favorite series at this time.  Love the characters and just when you think you have figured out  the plot, then it will turn in different direction for big surprise.  Enjoy!!!",
 'NEGATIVO')

In [19]:
# Mean accuracy of the SVM on the held-out (balanced) test set.
cls_svc.score(test_X_vect, test_y)

0.8054054054054054

In [20]:
# Mean accuracy of the decision tree on the held-out (balanced) test set.
cls_DTC.score(test_X_vect, test_y)

0.6324324324324324

In [21]:
from sklearn.metrics import f1_score    ### per-class F1 metric

# Predict once, then score each class separately (average=None returns one F1
# value per entry of `labels`, in that order).
pred_svc = cls_svc.predict(test_X_vect)
f1_score(test_y, pred_svc, average=None, labels=['NEGATIVO', 'POSITIVO'])

array([0.81818182, 0.79069767])

In [22]:
train_y.count('POSITIVO'), train_y.count('NEGATIVO') #, train_y.count('NEUTRAL')   number of training samples per class

(459, 459)

In [23]:
set_pruebas = ['I hate when good books end so quickly', 'very bad', 'very entertaining this book']   ### hand-written phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)                                              ### project the phrases onto the fitted vocabulary

# One predicted label per phrase, in the same order as set_pruebas.
cls_svc.predict(set_transformado)

array(['POSITIVO', 'NEGATIVO', 'POSITIVO'], dtype='<U8')

In [24]:
### Tune the SVM hyper-parameters with a cross-validated grid search.
from sklearn.model_selection import GridSearchCV

# Every combination of these values is tried: 3 kernels x 9 values of C x 2 gamma settings.
parametros = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': range(1, 10),
    'gamma': ('scale', 'auto')
}

# 5-fold cross-validation over the grid; nothing is fitted until .fit() is called.
cls_svc_gr = GridSearchCV(cls_svc, parametros, cv=5)
cls_svc_gr

In [25]:
# Run the grid search on the training data (fits one model per parameter combination and fold).
cls_svc_gr.fit(train_X_vect, train_y)

In [26]:
# Parameter combination that scored best in cross-validation.
cls_svc_gr.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}

In [27]:
# Score of the grid search's best estimator on the held-out test set.
cls_svc_gr.score(test_X_vect, test_y)

0.8054054054054054

In [28]:
import pickle                                   ## usando formado .pkl

In [29]:

# NOTE(review): only the fitted grid search is saved. The fitted vectorizer
# (`vectorizacion`) is not persisted, so this file alone cannot classify raw
# text in a fresh session.
with open('cls_svc_v1.pkl', 'wb') as f:
    pickle.dump(cls_svc_gr, f)                   ## serialise the fitted grid-search model to a .pkl file

In [30]:
# pickle.load can execute arbitrary code: only load files from a trusted source.
with open('cls_svc_v1.pkl', 'rb') as f:
    cls_svc_final = pickle.load(f)               ## read the model back from the .pkl file

In [31]:
import joblib                                   ## usando formado .joblib

In [37]:
## Export the tuned model with joblib (the fitted vectorizer is not included in this file).
joblib.dump(cls_svc_gr, 'best_model_ingles')

['best_model_ingles']

In [38]:
## Load the tuned model back from the joblib file.
model = joblib.load('best_model_ingles')

In [41]:
set_pruebas = ['very entertaining this book']   ### phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)

resultado = model.predict(set_transformado)   ## model loaded with joblib
##cls_svc_final.predict(set_transformado)  ## same prediction with the model loaded via pickle

# predict() returns an array with one label per phrase. Comparing the whole
# array with a string inside `if` only works for exactly one phrase and raises
# ValueError for more, so each label is handled individually.
for etiqueta in resultado:
    if etiqueta == 'POSITIVO':
        print('Positivo')
    elif etiqueta == 'NEGATIVO':
        print('Negativo')

Positivo
