# Librerias

In [2]:
# Importamos las librerias necesarias para el correcto funcionamiento del codigo
import nltk
import numpy as np
import pandas as pd
import random
import re
import string
import unicodedata
import pickle
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Pre-procesamiento de datos

In [3]:
#Creamos una variable df con el dataset
df = pd.read_csv("haha_2021_train.csv")

In [4]:
df.drop(["votes_no","votes_1", "votes_2", "votes_3", "votes_4", "votes_5","humor_mechanism","humor_target"],
          axis=1,
          inplace=True)

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\te512362\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
stop_words_sp = set(stopwords.words('spanish'))
stop_words_en = set(stopwords.words('english'))
#Concatenar las stopwords aplicándose a una cuenta que genera contenido en inglés y español
stop_words = stop_words_sp | stop_words_en

In [7]:
def LimpiarTexto(tweet):
    stopwords_english = stopwords.words('spanish')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # quita el "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet) 
    # quita hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet) 
    # quita hashtags dejando la palabra sin el signo # 
    tweet = re.sub(r'#', '', tweet) 
    tweet = re.sub(r'[^\w\s]','',tweet) #Signos de puntuacion 
    tweet = tweet.lower() #Minusculas
    tweet = tweet.replace("ñ", "#") #Reemplaza 'ñ'
    tweet = unicodedata.normalize("NFKD", tweet)\
    .encode("ascii","ignore").decode("ascii")\
    .replace("#", "ñ") #Quita acentos
   
    eliminar_stopwords(tweet, stopwords_english)
 
    return tweet

def eliminar_stopwords(texto, stopwords):
    return ' '.join([word for word in texto.split(' ') if word not in stopwords])

def Tokenizacion(tweet):
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)
  return tweet_tokens

def Stemming(tweet):
  stemmer = SnowballStemmer("spanish")
  tweets_clean = []
  for word in tweet:
      stem_word = stemmer.stem(word)  # stemming word
      tweets_clean.append(stem_word)

  return tweets_clean

In [8]:
df = df.assign(CleanText="",TokenizeText="", Text="")

In [9]:
for i in range(len(df)):
  df["CleanText"][i]  = LimpiarTexto(df['text'][i])
  df["TokenizeText"][i] = Tokenizacion(df['CleanText'][i])
  df["Text"][i] = Stemming(df['TokenizeText'][i])

df['TokenizeText'] = df.TokenizeText.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
df = df.assign(Corpus="")

In [11]:
x = []
for i in range(len(df)):
  x ="".join(df['TokenizeText'][i])
  df["Corpus"][i] = x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# Split Data

In [12]:
#Divide los datos en train y test
from sklearn.model_selection import train_test_split
y = df['is_humor']
x = df['Corpus']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3)

In [13]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(train_y)
Test_Y = Encoder.fit_transform(test_y)

# MLFlow

In [24]:
import mlflow
import mlflow.sklearn
track_uri = "http://localhost:5000/" # Esto puede ser que cambie por http://0.0.0.0:1234
mlflow.set_tracking_uri(track_uri)
mlflow.set_registry_uri("sqlite:////tmp/registry.db")


In [25]:
# Generando el experimento o cargandolo si existe
experiment_name = "Topicos"
mlflow.set_experiment(experiment_name)

# Cargando la información
client = mlflow.tracking.MlflowClient()
experiment_id = client.get_experiment_by_name(experiment_name).experiment_id


# Vamos a ver si es cierto
print(f"MLflow Version: {mlflow.__version__}")
print(f"Tracking URI: {mlflow.tracking.get_tracking_uri()}")
print(f"Nombre del experimento: {experiment_name}")
print(f"ID del experimento: {experiment_id}")

MLflow Version: 1.23.1
Tracking URI: http://localhost:5000/
Nombre del experimento: Topicos
ID del experimento: 2


In [27]:
mlflow.sklearn.autolog()
ngram_r = (1,2)
max_f = 100
Tfidf_vect = TfidfVectorizer(ngram_range=ngram_r, max_features=max_f)
Tfidf_vect.fit(df['Corpus'])
Train_X_Tfidf = Tfidf_vect.transform(train_x)
Test_X_Tfidf = Tfidf_vect.transform(test_x)
len(Tfidf_vect.vocabulary_)


with open('tfidf_vect.pkl', 'wb') as file:  
    pickle.dump(Tfidf_vect, file)

params = {'solver':'lbfgs'}
mlflow.log_params(params)
mlflow.log_artifact("tfidf_vect.pkl", "Vectores")
mlflow.log_metrics(
    {'ngram_param_1': ngram_r[0],'ngram_param_2':ngram_r[1] , 'max_features':max_f}
)

modelLogisticRegression = LogisticRegression(**params)

modelLogisticRegression.fit(Train_X_Tfidf,train_y)

# mlflow.sklearn.log_model(modelLogisticRegression, artifact_path="sklearn-model")

metrics = mlflow.sklearn.eval_and_log_metrics(modelLogisticRegression, Test_X_Tfidf, test_y, prefix="val_")
mlflow.end_run()

In [17]:
X = 'Van dos ciegos y le dice uno al otro: -Ojala lloviera... -Ojala yo tambien...'
prediccion_X = Tfidf_vect.transform([X])
modelLogisticRegression.predict(prediccion_X)

array([1], dtype=int64)