In [27]:
from textblob import TextBlob # libreria principal para NLP
import pandas as pd # importar csv
import numpy as np # manipular arrays
from sklearn.model_selection import train_test_split # datos de training y testing
from textblob.classifiers import NaiveBayesClassifier # clasificador 1
from textblob.classifiers import DecisionTreeClassifier # clasificador 2
import psutil # lectura de ram y cpu

In [2]:
# Se selecciona los datos preparados.
df = pd.read_csv("prepared_data.csv")
df

Unnamed: 0,labels,features
0,neutral,what said
1,positive,plus you ve added commercials to the experien...
2,neutral,didn today must mean need to take another trip
3,negative,it really aggressive to blast obnoxious enter...
4,negative,and it a really big bad thing about it
...,...,...
14635,positive,thank you we got on different flight to chicago
14636,negative,leaving over minutes late flight no warnings ...
14637,neutral,please bring american airlines to blackberry
14638,negative,you have my money you change my flight and do...


In [3]:
''' 
Para realizar adecuadamente la clasificacion, se cambia positiva -> pos, negative -> neg
Ademas se descarta los datos que sean neutrales.
'''

df = df[(df.labels == 'negative') | (df.labels == 'positive')]
df.labels = df.labels.apply(lambda x: 
    'pos' if(x == 'positive') else 'neg'   
);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.labels = df.labels.apply(lambda x:


In [4]:
df

Unnamed: 0,labels,features
1,pos,plus you ve added commercials to the experien...
3,neg,it really aggressive to blast obnoxious enter...
4,neg,and it a really big bad thing about it
5,neg,seriously would pay flight for seats that did...
6,pos,yes nearly every time fly vx this ear worm wo...
...,...,...
14633,neg,my flight was cancelled flightled leaving tom...
14634,neg,right on cue with the delays
14635,pos,thank you we got on different flight to chicago
14636,neg,leaving over minutes late flight no warnings ...


In [5]:
''' 
Se observa un desbalance que impide realizar una buena clasificacion, por lo que se va a reducir los datos con etiqueta "neg", para tener mejor
balance.
'''
df.labels.value_counts()

neg    9178
pos    2363
Name: labels, dtype: int64

In [6]:
df_neg = df[df.labels == 'neg']
df_pos = df[df.labels == 'pos']
df_new = pd.concat([df_pos,df_neg[:3000]])

In [7]:
df_new.labels.value_counts()

neg    3000
pos    2363
Name: labels, dtype: int64

In [8]:
# Se crea las listas de testing y training
X_train, X_test, y_train, y_test = train_test_split(df_new.features, df_new.labels, test_size=0.2, random_state=0)

In [11]:
# barra de progreso
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r\n"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [29]:
def txtblob_formatdata(x,y):
    ''' 
    Funcion que permite generar listas con el formato requerido por Textblob para realizar la clasificacion: 
    el lemento tiene la siguiente sintaxis: [(feature),(target)]
    '''
    data = []
    for i,j in enumerate(x):
        #print(i,j)
        data.append((j,y.tolist()[i]))
    return data

In [13]:
train = txtblob_formatdata(X_train,y_train)
test = txtblob_formatdata(X_test,y_test)
print("Train lenght:",len(train))
print("Test lenght:",len(test))

Train lenght: 4290
Test lenght: 1073


In [14]:
def batches(data_list,batch_sep):
    ''' 
    Para optimizar la clasificacion, se realiza una separacion del dataset en paquetes.
    '''
    batches = []
    batch_size = int(len(data_list)/batch_sep)
    print(batch_sep,"paquetes, cada uno tiene",batch_size,"datos.")
    for i in range(batch_sep):
        data = data_list[i*batch_size:(i+1)*batch_size]
        batches.append(data)
    return batches

In [15]:
batch_train = batches(train,16) # se obtuvo una lista de 16 paquetes cada uno con 268 features
#batch_test = batches(test,16)

16 paquetes, cada uno tiene 268 datos.


In [16]:
classifier = NaiveBayesClassifier(batch_train[0]) # se construye el clasificador con datos iniciales
batch_sep = 15
for i in range(batch_sep):
    printProgressBar(i+1,batch_sep) # barra de progreso
    print('The CPU usage is: ', psutil.cpu_percent()) # cpu usage
    print('RAM memory % used:', psutil.virtual_memory()[2]) # ram usage
    classifier.update(batch_train[i+1]) # añade datos nuevos contenidos en los paquetes construidos

 |██████----------------------------------------------------------------------------------------------| 6.7% 
The CPU usage is:  60.8
RAM memory % used: 35.3
 |█████████████---------------------------------------------------------------------------------------| 13.3% 
The CPU usage is:  73.5
RAM memory % used: 34.8
 |████████████████████--------------------------------------------------------------------------------| 20.0% 
The CPU usage is:  29.7
RAM memory % used: 35.4
 |██████████████████████████--------------------------------------------------------------------------| 26.7% 
The CPU usage is:  24.1
RAM memory % used: 36.0
 |█████████████████████████████████-------------------------------------------------------------------| 33.3% 
The CPU usage is:  23.6
RAM memory % used: 36.7
 |████████████████████████████████████████------------------------------------------------------------| 40.0% 
The CPU usage is:  25.1
RAM memory % used: 37.3
 |█████████████████████████████████████████████

In [21]:
classifier # objetos entrenados

<NaiveBayesClassifier trained on 4288 instances>

In [22]:
print('RAM memory % used:', psutil.virtual_memory()[2])
print(classifier.accuracy(test)) # resultado de la clasificacion basado los datos de test construido anteriormente

RAM memory % used: 55.5
0.875116495806151


In [23]:
classifier.show_informative_features() # listado de features informativos.

Most Informative Features
       contains(jetblue) = True              pos : neg    =     34.8 : 1.0
         contains(thank) = True              pos : neg    =     19.4 : 1.0
         contains(kudos) = True              pos : neg    =     18.0 : 1.0
      contains(terrible) = True              neg : pos    =     17.8 : 1.0
     contains(excellent) = True              pos : neg    =     17.2 : 1.0
            contains(hr) = True              neg : pos    =     15.6 : 1.0
         contains(until) = True              neg : pos    =     15.6 : 1.0
        contains(online) = True              neg : pos    =     14.1 : 1.0
         contains(hours) = True              neg : pos    =     13.8 : 1.0
         contains(stuck) = True              neg : pos    =     13.5 : 1.0


In [25]:
cl_test = TextBlob('this flight was awesome!',classifier=classifier) # prueba de clasificacion con texto creado
print(cl_test.classify()) # resultado de prueba.

pos
