In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [2]:
# Path of the reviews file that is parsed line by line with json.loads later in the notebook.
archivo = 'libro.json'

In [3]:
class Comentario:
    """A single review: its text, its numeric rating and the derived sentiment label.

    Attributes:
        text: Review body.
        score: Numeric rating given by the reviewer.
        sentimiento: 'NEGATIVO' when ``score <= 2``, otherwise 'POSITIVO'
            (a rating of 3 is counted as positive; there is no neutral class).
    """

    def __init__(self, text, score):
        """Store the review data and classify it once, at construction time."""
        self.text = text
        self.score = score
        # The label is computed by a helper with a different name so that the
        # ``sentimiento`` attribute does not overwrite a method of the same name
        # on the instance.
        self.sentimiento = self._clasificar()

    def _clasificar(self):
        """Return the binary sentiment label that corresponds to ``self.score``."""
        if self.score <= 2:
            return 'NEGATIVO'
        return 'POSITIVO'
        
class Container:
    """Holds a list of comments and can balance it to equal class counts."""

    def __init__(self, comentario):
        """Store the comments; each one must expose ``text`` and ``sentimiento``."""
        self.comentario = comentario

    def distribucion(self):
        """Balance the held comments to the same number of negatives and positives.

        Both classes are cut to the size of the smaller one (previously only the
        positives were cut, which left the data unbalanced whenever negatives
        outnumbered positives). The balanced list replaces ``self.comentario``
        and is shuffled in place with the module-level ``random`` generator;
        the list originally passed to the constructor is not modified.
        """
        negativo = [x for x in self.comentario if x.sentimiento == 'NEGATIVO']
        positivo = [x for x in self.comentario if x.sentimiento == 'POSITIVO']

        limite = min(len(negativo), len(positivo))
        self.comentario = negativo[:limite] + positivo[:limite]
        random.shuffle(self.comentario)

    def get_text(self):
        """Return the review texts, in the current order of the container."""
        return [x.text for x in self.comentario]

    def get_sentimiento(self):
        """Return the sentiment labels, aligned with ``get_text()``."""
        return [x.sentimiento for x in self.comentario]

    def get_sentiemiento(self):
        """Misspelled alias of ``get_sentimiento``, kept for existing callers."""
        return self.get_sentimiento()

In [4]:
dato = []

# The file holds one JSON object per line, so each line is parsed on its own
# with json.loads (json.load would expect a single document for the whole file).
# The encoding is given explicitly so the result does not depend on the platform default.
with open(archivo, encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an empty line at the end of the file
        datos = json.loads(line)
        dato.append(Comentario(datos['reviewText'], datos['overall']))

      

In [5]:
# Spot-check one loaded review: text, rating and derived sentiment label.
dato[60].text, dato[60].score, dato[60].sentimiento


("This was a very quick to the point read. It had a lot of plot packed into it. It was pretty fast paced. Definitely not boring & had a bit of a twist. And as of now, I really, really don't like Lilly...I agree with the Krissy K. Even though they are short, the books are well worth .99. And so far the next book comes out pretty quickly. Sooo, I don't mind the stinking cliffhanger as much, lol!",
 4.0,
 'POSITIVO')

In [6]:
# Number of reviews loaded.
len(dato)

10000

In [7]:
# Fixed seed so that both the train/test split and the random.shuffle call inside
# Container.distribucion() give the same result on every run.
# The outputs recorded in this notebook came from an unseeded run, so the exact
# counts and scores can differ slightly after re-running.
SEED = 42
random.seed(SEED)

# 70 % of the reviews for training, 30 % for testing.
train_set, test_set = train_test_split(dato, test_size=0.30, random_state=SEED)

train_container = Container(train_set)
test_container = Container(test_set)

# Balance each split separately so both have equal class counts.
train_container.distribucion()
test_container.distribucion()

In [8]:
# Peek at one sample of the raw training split. train_set itself is not balanced;
# the balanced, shuffled data lives in train_container.
train_set[60].text, train_set[60].score, train_set[60].sentimiento

("I nearly didn't purchase this novella based on the title alone, but I've seen recommendations for this author and thought it would be a good opportunity to sample her work, and I must say I'm pleased that I did. I enjoyed this novella very much, l liked the fact that Fabrizio wasn't a complete jerk and it was obvious pretty early that he had feelings for Jenna. I like the fact that Jenna had a backbone and that when she agreed to be with him again it was on her terms. Well worth the purchase price, I'll be looking at other books by this author.",
 4.0,
 'POSITIVO')

In [9]:
# Balanced texts (X) and sentiment labels (y) for training and for evaluation.
train_X, train_y = train_container.get_text(), train_container.get_sentiemiento()
test_X, test_y = test_container.get_text(), test_container.get_sentiemiento()

# Class counts per split, printed in this order:
# train POSITIVO, train NEGATIVO, test POSITIVO, test NEGATIVO.
for etiquetas in (train_y, test_y):
    for clase in ('POSITIVO', 'NEGATIVO'):
        print(etiquetas.count(clase))

467
467
177
177


In [10]:
# One example from the balanced training data together with its label.
train_X[10], train_y[10]

("A narcissist of the first order. I feel sorry for her kids. Don't contribute to her inexplicable wealth by buying this book.",
 'NEGATIVO')

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # turn text into a sparse numeric feature matrix

# TF-IDF weights (floats, not 0/1 flags): terms that occur in many reviews are weighted down.
# CountVectorizer() is the imported alternative that produces raw term counts.
vectorizacion = TfidfVectorizer()

# Learn the vocabulary from the training texts only, then reuse it for the test texts.
train_X_vect = vectorizacion.fit_transform(train_X)
test_X_vect = vectorizacion.transform(test_X)

In [12]:
# Show shape and sparsity of the vectorized test set.
test_X_vect

<354x9873 sparse matrix of type '<class 'numpy.float64'>'
	with 21178 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Candidate classifiers, created with their default hyperparameters.
cls_svc = SVC()
cls_DTC = DecisionTreeClassifier()
# GaussianNB and LogisticRegression are imported but currently not instantiated:
##cls_gaus = GaussianNB()
##cls_log = LogisticRegression()

In [14]:
# Train the decision tree on the vectorized training data.
cls_DTC.fit(train_X_vect, train_y)

In [15]:
# Sanity check on the first training sample (seen during fit, so this is not an evaluation).
cls_DTC.predict(train_X_vect[0])

array(['POSITIVO'], dtype='<U8')

In [16]:
# Train the support vector classifier on the vectorized training data.
cls_svc.fit(train_X_vect, train_y)

In [17]:
# Sanity check on the first training sample (seen during fit, so this is not an evaluation).
cls_svc.predict(train_X_vect[0])

array(['POSITIVO'], dtype='<U8')

In [18]:
# Text and true label of training sample 0, the one passed to predict() as train_X_vect[0].
train_X[0],train_y[0]

("Another great read from Baldacci.  I admit,  I didn't pick the end until it was being revealed to us.Great action, fast moving story line. There is never a dull moment with King and Maxwell. Thank goodness not a book of silly romance.  This book will be enjoyed by male and female readers.  Have another couple of books standing by from this author.  Hope you enjoy this book as much as I did.",
 'POSITIVO')

In [19]:
# Mean accuracy of the SVM on the balanced test data.
cls_svc.score(test_X_vect, test_y)

0.8135593220338984

In [20]:
# Mean accuracy of the decision tree on the balanced test data.
cls_DTC.score(test_X_vect, test_y)

0.6242937853107344

In [21]:
from sklearn.metrics import f1_score    # per-class F1 metric

# average=None yields one F1 value per entry of `labels`, in that order (NEGATIVO first).
f1_score(test_y, cls_svc.predict(test_X_vect), average=None, labels=['NEGATIVO', 'POSITIVO'])

array([0.82446809, 0.80120482])

In [22]:
train_y.count('POSITIVO'), train_y.count('NEGATIVO') #, train_y.count('NEUTRAL')   number of training samples per class

(467, 467)

In [23]:
set_pruebas = ['I hate when good books end so quickly', 'very bad', 'very entertaining this book']   # ad-hoc phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)                                              # vectorize them with the already-fitted vocabulary

cls_svc.predict(set_transformado)

array(['POSITIVO', 'NEGATIVO', 'POSITIVO'], dtype='<U8')

In [24]:
### Tune the SVM hyperparameters with a cross-validated grid search
from sklearn.model_selection import GridSearchCV

# Every combination of kernel, C (1..9) and gamma is evaluated.
# NOTE(review): gamma is not used by the 'linear' kernel, so each linear/C pair
# is fitted once per gamma value with identical results.
parametros = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': range(1, 10),
    'gamma': ('scale', 'auto')
}

# 5-fold cross-validation on top of the SVC instance created earlier.
cls_svc_gr = GridSearchCV(cls_svc, parametros, cv=5)
cls_svc_gr

In [25]:
# Run the grid search on the vectorized training data.
cls_svc_gr.fit(train_X_vect, train_y)

In [26]:
# Parameter combination with the best cross-validation result.
cls_svc_gr.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

In [27]:
# Score of the tuned model on the balanced test data.
cls_svc_gr.score(test_X_vect, test_y)

0.8135593220338984

In [28]:
import pickle                                   ## usando formado .pkl

In [29]:

# Serialize the fitted grid-search object to a .pkl file.
with open('cls_svc_v1.pkl', 'wb') as f:
    pickle.dump(cls_svc_gr, f)

In [30]:
# Read the pickled model back. Only unpickle files from a trusted source:
# pickle.load can execute arbitrary code contained in the file.
with open('cls_svc_v1.pkl', 'rb') as f:
    cls_svc_final = pickle.load(f)

In [31]:
import joblib                                   ## usando formado .joblib

In [32]:
## Export the tuned model with joblib
joblib.dump(cls_svc_gr, 'best_model_ingles')

['best_model_ingles']

In [33]:
## Load the tuned model back from disk
model = joblib.load('best_model_ingles')

In [35]:
# Export the fitted vectorizer as well: new text must be transformed with it before prediction.
# NOTE(review): this cell's execution count (35) is higher than that of the cell after it (34),
# so the cells were run out of order — confirm the notebook works with Restart & Run All.
joblib.dump(vectorizacion, 'vect_fit_ingles')

['vect_fit_ingles']

In [34]:
set_pruebas = ['very entertaining this book']   # phrases to classify
set_transformado = vectorizacion.transform(set_pruebas)

resultado = model.predict(set_transformado)   # model loaded with joblib
##cls_svc_final.predict(set_transformado)     # same call with the model loaded with pickle

# predict() returns an array with one label per phrase. Comparing the whole array
# to a string inside `if` only works when there is exactly one phrase (it raises
# ValueError otherwise), so each label is handled on its own.
for etiqueta in resultado:
    if etiqueta == 'POSITIVO':
        print('Positivo')
    elif etiqueta == 'NEGATIVO':
        print('Negativo')

Positivo


In [36]:
## Self-contained inference example: needs only the two files exported with joblib
import joblib

## Load the tuned model and the fitted vectorizer
model = joblib.load('best_model_ingles')
vectorizacion_ingles = joblib.load('vect_fit_ingles')

## Phrases to classify
set_pruebas = ['very entertaining this book']
set_transformado = vectorizacion_ingles.transform(set_pruebas)

## Model prediction: an array with one label per phrase
resultado = model.predict(set_transformado)

# Handle each label on its own; comparing the whole array to a string inside `if`
# only works for exactly one phrase and raises ValueError for more.
for etiqueta in resultado:
    if etiqueta == 'POSITIVO':
        print('Positivo')
    elif etiqueta == 'NEGATIVO':
        print('Negativo')

Positivo
