In [1]:
from pydantic import BaseModel
import re, string, unicodedata
from num2words import num2words
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import nltk
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from joblib import dump

[nltk_data] Downloading package stopwords to C:\Users\El
[nltk_data]     supremo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\El
[nltk_data]     supremo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [x.lower() for x in words]

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', ' ', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = num2words(word, lang = 'es_CO')
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    stopword_es = nltk.corpus.stopwords.words('spanish')
    new_words = []

    for word  in words:
        if word not in stopword_es:
            new_words.append(word)
    return new_words
    
def preprocessing(words):
    words = to_lowercase(words)
    words = replace_numbers(words)
    words = remove_punctuation(words)
    words = remove_non_ascii(words)
    words = remove_stopwords(words)
    return words

In [9]:
def join_tokens(tokens):
    return ' '.join(tokens)

def generate_pipeLine():
    df_peliculas = pd.read_csv('./data/MovieReviews.csv', sep=',', encoding = 'utf-8')

    x_train_m2, x_test_m2, y_train_m2,  y_test_m2  = train_test_split(df_peliculas['review_es'], df_peliculas['sentimiento'], test_size=0.8, random_state=1)


    pipeline = Pipeline(
        [
            ('tokenizer', FunctionTransformer(word_tokenize)),
            ('preprocessing',FunctionTransformer(preprocessing)),
            ('join_tokens', FunctionTransformer(join_tokens)),
            ('tokenizer_2', FunctionTransformer(word_tokenize)),
            ('preprocessing_2',FunctionTransformer(preprocessing)),
            ('join_tokens_2', FunctionTransformer(join_tokens)),
            ('vectorizador', CountVectorizer(analyzer='word')),
            ('model', LogisticRegression())
        ]
    )
    pipeline.fit(x_train_m2, y_train_m2)
    dump(pipeline, 'modelo.joblib')
    print('MAE:')
    print('Train:', mean_absolute_error(y_train_m2, pipeline.predict(x_train_m2)))
    print('Test:', mean_absolute_error(y_test_m2, pipeline.predict(x_test_m2)))
    print('\nRMSE:')
    print('Train:', np.sqrt(mean_squared_error(y_train_m2, pipeline.predict(x_train_m2))))
    print('Test:', np.sqrt(mean_squared_error(y_test_m2, pipeline.predict(x_test_m2))))



In [10]:
def use_pipeline(movie):
    filename = 'modelo.joblib'
    df_recent = pd.read_csv('./data/'+movie+'.csv', sep=',', encoding = 'utf-8') # Lectura de los datos recientes
    pipeline = dump.load(filename)
    y_predicted =  pipeline.predict(df_recent)
    return y_predicted

In [11]:
print("Generado Pipeline:")
generate_pipeLine()

Generado Pipeline:


TypeError: expected string or bytes-like object