# Actualización - ¿Cómo instalar la base de datos nltk?

In [None]:
pip install nltk

In [None]:
import nltk
# Fetch the 'movie_reviews' corpus that the rest of the notebook reads.
nltk.download('movie_reviews')

# Obtención de los datos.

https://www.nltk.org/

In [None]:
from nltk.corpus import movie_reviews

# NOTE(review): called without arguments this presumably opens NLTK's
# interactive downloader; the corpus used below is already fetched by the
# earlier nltk.download('movie_reviews') call — confirm this is still needed.
nltk.download()

http://www.nltk.org/nltk_data/

In [None]:
# List the category labels of the corpus (used later as the class labels).
movie_reviews.categories()

In [None]:
# List the identifiers of every review file in the corpus.
movie_reviews.fileids()

In [None]:
# Peek at the tokens of one review stored under the 'neg' category.
movie_reviews.words('neg/cv000_29416.txt')

In [None]:
# Peek at the tokens of a second review stored under the 'neg' category.
movie_reviews.words('neg/cv002_17424.txt')

# Preparación de los datos.

https://www.nltk.org/book/ch06.html

In [None]:
# One (word_list, label) pair per review: walk every category and, inside it,
# every file identifier that belongs to that category.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

### Mezclado aleatorio de los documentos.
https://docs.python.org/3/library/random.html

In [None]:
import random
# Fix the seed so the shuffle below gives the same order on every run.
random.seed(100)
# Shuffle in place so the reviews are no longer grouped by category.
random.shuffle(documents)

In [None]:
# Number of (word_list, label) pairs, i.e. number of reviews.
len(documents)

In [None]:
# Each entry is a (word_list, label) pair, so this shows the pair's length.
len(documents[0])

In [None]:
# Inspect the first (word_list, label) pair after shuffling.
documents[0]

In [None]:
# Split the first pair into its word list and its label, then print both
# with an empty line between them.
text, opinion = documents[0]
print(text, opinion, sep="\n\n")

# Identifiquemos las palabras más utilizadas.

In [None]:
# Count how often each lower-cased word appears across the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
all_words

In [None]:
# Number of distinct lower-cased words counted above.
len(all_words)

In [None]:
# Keep the 5000 most frequent words as the classifier's vocabulary.
# Iterating a FreqDist directly yields words in first-seen order rather than
# by frequency, so the ranking is requested explicitly with most_common().
word_features = [word for word, _count in all_words.most_common(5000)]
word_features

In [None]:
# Size of the vocabulary kept as features.
len(word_features)

# Función para extraer cuáles de las 5000 palabras existen en una reseña.

In [None]:
def document_features(documents):
    """Build the presence/absence feature dict for one review.

    Despite its plural name, ``documents`` is the word sequence of a single
    review. Every entry of the module-level ``word_features`` produces a
    ``'contains(<word>)'`` key whose value is True when that word occurs in
    the review and False otherwise.
    """
    present = set(documents)  # set membership keeps each lookup cheap
    return {
        'contains({})'.format(candidate): candidate in present
        for candidate in word_features
    }

In [None]:
# Try the extractor on the word list of the first review.
document_features(documents[0][0])

# Repitamos lo anterior para todas las reseñas.

In [None]:
# Pair the feature dict of every review with that review's label.
featuresets = [
    (document_features(words), label)
    for words, label in documents
]

In [None]:
# Inspect the first (feature_dict, label) pair.
featuresets[0]

In [None]:
# Label of the first pair.
featuresets[0][1]

In [None]:
# One feature set per review.
len(featuresets)

In [None]:
# Each entry was built as a (feature_dict, label) tuple.
type(featuresets[0])

# Separación en conjunto de Entrenamiento y Prueba.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the feature sets for testing; the fixed random_state keeps
# the split identical between runs.
train_set, test_set = train_test_split(
    featuresets,
    test_size=0.10,
    random_state=100,
)

In [None]:
# Show the size of each split, with an empty line between the two numbers.
print(len(train_set), len(test_set), sep="\n\n")

# Clasificador Naive-Bayes.

### Entrenando el Clasificador.

https://www.nltk.org/_modules/nltk/classify/naivebayes.html

In [None]:
# Train NLTK's Naive Bayes classifier on the (feature_dict, label) pairs
# of the training split.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Ask the trained classifier for its 20 most informative features.
classifier.most_informative_features(20)

In [None]:
# Same top-20 request, through the classifier's display helper.
classifier.show_most_informative_features(20)

### Probemos el Funcionamiento del Clasificador.

In [None]:
# Inspect the first (feature_dict, label) pair of the test split.
test_set[0]

In [None]:
# Classify the feature dict of the first test example.
classifier.classify(test_set[0][0])

# Matriz de confusión y reporte de clasificación.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Etiquetas de entrenamiento.

In [None]:
# True label (second item of each pair) for every training example.
y_train = [label for _features, label in train_set]

y_train

In [None]:
# Label the classifier assigns to the feature dict of every training example.
y_train_predict = [
    classifier.classify(features) for features, _label in train_set
]

y_train_predict

### Comparación entre las etiquetas reales de los datos de Entrenamiento y las etiquetas encontradas por el clasificador.

In [None]:
# Confusion matrix of true vs. predicted training labels, with the class
# order fixed to "pos" then "neg".
cm = confusion_matrix(y_train, y_train_predict, labels=["pos", "neg"])

In [None]:
# Draw the confusion matrix held in `cm` as an annotated heatmap.
labels = ["pos", "neg"]

# `linewidths` is the keyword seaborn.heatmap documents for the cell borders.
sns.heatmap(cm, linewidths=0.5, annot=True, cmap='Reds', fmt='g',
            xticklabels=labels, yticklabels=labels)

# confusion_matrix puts the true labels on the rows and the predictions on
# the columns, so the y axis carries the real values and the x axis the
# predicted ones.
plt.ylabel('Valores reales')
plt.xlabel('Valores predichos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

In [None]:
# Print scikit-learn's classification report for the training predictions.
print(classification_report(y_train, y_train_predict))

### Etiquetas de prueba.

In [None]:
# True label (second item of each pair) for every test example.
y_test = [label for _features, label in test_set]

y_test

In [None]:
# Label the classifier assigns to the feature dict of every test example.
y_test_predict = [
    classifier.classify(features) for features, _label in test_set
]

y_test_predict

### Comparación entre las etiquetas reales de los datos de Prueba y las etiquetas encontradas por el clasificador.

In [None]:
# Confusion matrix of true vs. predicted test labels, with the class order
# fixed to "pos" then "neg".
cm = confusion_matrix(y_test, y_test_predict, labels=["pos", "neg"])

In [None]:
# Draw the confusion matrix held in `cm` as an annotated heatmap.
labels = ["pos", "neg"]

# `linewidths` is the keyword seaborn.heatmap documents for the cell borders.
sns.heatmap(cm, linewidths=0.5, annot=True, cmap='Reds', fmt='g',
            xticklabels=labels, yticklabels=labels)

# confusion_matrix puts the true labels on the rows and the predictions on
# the columns, so the y axis carries the real values and the x axis the
# predicted ones.
plt.ylabel('Valores reales')
plt.xlabel('Valores predichos')
# This plot compares against the test labels, not the training ones.
plt.title('Comparación con valores de prueba')
plt.show()

In [None]:
# Print scikit-learn's classification report for the test predictions.
print(classification_report(y_test, y_test_predict))