In [2]:
# Mount Google Drive at /content/drive/ so the project folder under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Show the current working directory (shell escape) before changing into the project folder.
!pwd

/content


In [4]:
# Project folder on the mounted Drive; a later cell changes the working directory to it.
WORKING_PATH = '/content/drive/MyDrive/NLP'

In [5]:
# Change the working directory to the project folder; IPython interpolates the Python variable in braces.
%cd {WORKING_PATH}

/content/drive/MyDrive/NLP


In [7]:
!pip install -r requirements.txt

Collecting num2words (from -r requirements.txt (line 9))
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting plotly_express (from -r requirements.txt (line 12))
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting pyDAWG (from -r requirements.txt (line 13))
  Downloading pyDAWG-1.0.1.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyLDAvis (from -r requirements.txt (line 14))
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting sklearn_crfsuite (from -r requirements.txt (line 17))
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting stop_words (from -r requirements.txt (line 18))
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython->-r requirements.txt (line 3))
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting docopt>=0.6.2 (from num2w

In [9]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix, classification_report

# Load the preprocessed reviews. The location is built from WORKING_PATH (set in
# an earlier cell) so the Drive folder is defined in a single place instead of
# being repeated as a second absolute path.
DATA_PATH = f'{WORKING_PATH}/datos_prepocesados.csv'
df = pd.read_csv(DATA_PATH)
print(df.head())

# Map a numeric rating to a binary sentiment label.
def asignar_etiqueta(overall, umbral=4):
    """Return 1 when the rating reaches the threshold, otherwise 0.

    Args:
        overall: Numeric rating of a review.
        umbral: Minimum rating that is labelled 1. Defaults to 4, the cut-off
            that used to be hard-coded, so existing one-argument calls
            (e.g. ``Series.apply(asignar_etiqueta)``) behave as before.

    Returns:
        int: 1 if ``overall >= umbral``, else 0. A NaN rating compares false
        and therefore yields 0.
    """
    return 1 if overall >= umbral else 0

# Build the binary target column by labelling every rating in 'overall'.
ratings = df['overall']
df['sentiment_label'] = ratings.apply(asignar_etiqueta)

# Ensure the processed-text column has no missing values before the split:
# any NaN is replaced with an empty string and the column is checked again.
text_col = 'reviewTextProcesado'
if not df[text_col].isnull().any():
    print("No hay NaNs en 'reviewTextProcesado' antes de la división.")
else:
    print("Hay valores NaN en 'reviewTextProcesado' antes de la división.")
    df[text_col] = df[text_col].fillna('')  # empty string stands in for a missing review
    print("NaNs rellenados. Verificando de nuevo:")
    nan_remaining = df[text_col].isnull().any()
    print("A pesar de rellenar, siguen existiendo NaNs." if nan_remaining else "Ya no hay NaNs.")

# Model inputs (processed review text) and binary targets.
sentences = df['reviewTextProcesado']
labels = df['sentiment_label'].values

# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

# Size of each split plus the first label as a sanity check.
print('Train instances:', len(x_train))
print('Train labels:', len(y_train))
print('Examples:', y_train[0])

print('Test instances:', len(x_test))
print('Test labels:', len(y_test))
print('Example:', y_test[0])

# Class counts and the label-1 / label-0 ratio, first for train and then for test.
for split_labels in (y_train, y_test):
    label_counts = Counter(split_labels)
    print(f'Label:0, Count: {label_counts[0]}')
    print(f'Label:1, Count: {label_counts[1]}')
    print(f'Proportion:{label_counts.get(1)/label_counts.get(0)}')

# Feature extractors to compare; each entry is [name, vectorizer].
# The count-based vectorizers share the same lowercase / vocabulary-cap settings.
count_kwargs = dict(lowercase=True, max_features=1000)
features_extractor = [
    ['unigramas', CountVectorizer(ngram_range=(1, 1), **count_kwargs)],
    ['unigramas_v1', CountVectorizer(ngram_range=(1, 1), max_df=0.95, **count_kwargs)],
    ['bigramas', CountVectorizer(ngram_range=(2, 2), **count_kwargs)],
    ['trigramas', CountVectorizer(ngram_range=(3, 3), **count_kwargs)],
    ['tf_idf', TfidfVectorizer(lowercase=True)],
]

# Classifiers to compare; each entry is [name, estimator].
parallel_jobs = 2
seed = 42
estimators = [
    ['knn', KNeighborsClassifier(n_neighbors=5, n_jobs=parallel_jobs)],
    ['Knn_v1', KNeighborsClassifier(n_neighbors=3, n_jobs=parallel_jobs)],
    ['bayes', GaussianNB()],
    ['rf', RandomForestClassifier(random_state=seed, n_jobs=parallel_jobs)],
    ['svm', LinearSVC(random_state=seed)],
]

# Train and evaluate every (feature extractor, classifier) combination.
# results[feature_name][estimator_name] holds the classification_report dict
# computed on the test split.
results = {}
for feature_name, feature_model in features_extractor:
    print(f'Testing {feature_name}')
    results[feature_name] = {}

    # Fit the vectorizer on the training split only; the test split is just transformed.
    train_features = feature_model.fit_transform(x_train)
    test_features = feature_model.transform(x_test)

    # Row-normalise the count-based features; tf_idf output is used as produced.
    if feature_name != 'tf_idf':
        train_features = normalize(train_features, axis=1)
        test_features = normalize(test_features, axis=1)

    # The estimators are fed dense arrays. An ndarray is passed through unchanged
    # (no detour through a Python list of rows).
    # NOTE(review): tf_idf has no max_features cap, so densifying it may need a
    # lot of memory — confirm it fits in RAM for this dataset.
    _x_train = train_features if isinstance(train_features, np.ndarray) else train_features.toarray()
    _x_test = test_features if isinstance(test_features, np.ndarray) else test_features.toarray()

    for estimator_name, estimator_model in estimators:
        print(f'Testing {estimator_name}')
        # The same estimator object is refit for every feature set.
        estimator_model.fit(_x_train, y_train)
        pred_labels = estimator_model.predict(_x_test)
        results[feature_name][estimator_name] = classification_report(
            y_test, pred_labels, output_dict=True
        )
        # Indent both reports with three tabs (the first print used to emit a literal 't').
        print('\t\t\t', str(confusion_matrix(y_test, pred_labels)).replace('\n', '\n\t\t\t'))
        print('\t\t\t', classification_report(y_test, pred_labels).replace('\n', '\n\t\t\t'))


   overall  verified   reviewTime      reviewerID        asin  \
0        5      True   09 1, 2016  A3CIUOJXQ5VDQ2  B0000530HU   
1        5      True  11 14, 2013  A3H7T87S984REU  B0000530HU   
2        1      True  08 18, 2013  A3J034YH7UG4KT  B0000530HU   
3        5     False   05 3, 2011  A2UEO5XR3598GI  B0000530HU   
4        5      True   05 6, 2011  A3SFRT223XXWF7  B00006L9LC   

                                               style     reviewerName  \
0  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...         Shelly F   
1  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...     houserules18   
2  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...             Adam   
3  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...           Rich K   
4                          {'Size:': ' 200ml/6.7oz'}  C. C. Christian   

                                          reviewText  \
0                   As advertised. Reasonably priced   
1  Like the oder and the feel when I put it on my...   
2 