# Práctico 5 (parte 4)

## Entrenar word embeddings

## Importación de módulos y librerías

In [1]:
# Inclusion de librerias y módulos
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Stop words: NLTK's English list plus a few punctuation tokens of our own.
# The corpus reader is imported under an alias so that binding the resulting
# list to `stopwords` does not overwrite the reader itself.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english') + [',', "’", '.', ':', '-', ';']

# Algunas utilidades
from utiles import print_some_info

# Nos permite convertir str a list
from ast import literal_eval

# Hex colour codes used by the plots, plus the list that collects them in order
BLUE, RED, ORANGE = '#5DADE2', '#ff7043', '#F5B041'
GREEN, YELLOW = '#58D68D', '#F4D03F'
pltcolors = [BLUE, RED, ORANGE, GREEN, YELLOW]

# Matplotlib defaults for axes, legends and figures: bold axis labels,
# bold axes/figure titles, and a shadow behind legends
plt.rcParams.update({
    "axes.labelweight": "bold",
    "axes.titleweight": "bold",
    "legend.shadow": True,
    "figure.titleweight": "bold",
})

# The dataset folder sits next to the notebook's directory (one level up)
data_dir = os.path.join(os.pardir, 'dataset')

## Lectura del archivo de mensajes
Utilizamos únicamente el archivo de mensajes, dado que vamos a entrenar un word embedding como word2vec. Entendemos que, para el propósito del análisis y porque no estamos empleando ningún modelo de clasificación o regresión, podemos usar el conjunto de datos completo.

In [2]:
# Preprocessed messages file, located inside data_dir
filename = 'dev_yup_messages_preprocessed.csv'
df = pd.read_csv(filepath_or_buffer=os.path.join(data_dir, filename))

# Report which file was loaded, then summarize the frame with the project helper
print(f'El conjunto de datos utilizado es {filename}')
print_some_info(df)


El conjunto de datos utilizado es dev_yup_messages_preprocessed.csv
El conjunto de datos posee 234375 filas y 6 columnas
&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 234375 entries, 0 to 234374
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   session_id    234375 non-null  int64 
 1   created_at    234375 non-null  object
 2   sent_from     234375 non-null  object
 3   sent_to       234375 non-null  object
 4   content_type  234375 non-null  object
 5   text          234375 non-null  object
dtypes: int64(1), object(5)
memory usage: 10.7+ MB
None


In [3]:
import re
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

def convert_emojis(text, emoji_names=None):
    """Replace the emojis found in ``text`` with their textual names.

    Only characters outside the Basic Multilingual Plane (code points above
    U+FFFF, i.e. the ones UTF-8 encodes with four bytes) are considered.
    Each such character is looked up on its own, so an emoji is found no
    matter which characters precede it in the string.

    Parameters
    ----------
    text : str
        Token to inspect.
    emoji_names : mapping, optional
        Maps an emoji character to its replacement name. Defaults to the
        ``UNICODE_EMO`` table imported from ``emot``.

    Returns
    -------
    str
        The names of every recognised emoji joined by single spaces. When
        nothing is recognised, ``text`` is returned unchanged. Note that any
        non-emoji characters sharing the token with a recognised emoji are
        dropped from the result.
    """
    if emoji_names is None:
        emoji_names = UNICODE_EMO

    names = [
        emoji_names[char]
        for char in text
        if ord(char) > 0xFFFF and char in emoji_names
    ]
    joined = ' '.join(names)
    return joined if joined else text

# Emoticon -> ":token:" lookup used when converting emoticons to words.
# Each token is built from the lower-cased first comma-separated meaning in
# EMOTICONS, with "or " removed and spaces replaced by underscores.
OUREMOTIC = {
    emoticon: f":{meaning.lower().split(',')[0].replace('or ','').replace(' ','_')}:"
    for emoticon, meaning in EMOTICONS.items()
}
def convert_emoticons(text):
    """Turn an emoticon token into a ``:meaning:`` word token.

    ``emot.emoticons`` is asked to analyse ``text``; when its result is
    flagged, the first reported meaning is lower-cased, its spaces are
    replaced by underscores and it is wrapped in colons. In every other
    case, including any exception raised along the way, ``text`` is
    returned as received.
    """
    try:
        found = emot.emoticons(text)
        if not found['flag']:
            return text
        meaning = found['mean'][0]
        return ':' + meaning.replace(' ', '_').lower() + ':'
    except Exception:
        # Best effort: any failure leaves the token untouched
        return text

In [5]:
def _clean_tokens(raw_text, stopword_set):
    """Parse one serialized token list and return its cleaned tokens.

    ``raw_text`` is a string holding a Python list literal of tokens. Each
    token has its emojis replaced by names (``convert_emojis``), is
    lower-cased to unify the treatment, and is dropped when it belongs to
    ``stopword_set``.
    """
    cleaned = []
    for token in literal_eval(raw_text):
        token = convert_emojis(token).lower()
        if token not in stopword_set:
            cleaned.append(token)
    return cleaned


# 1-2. Keep only the rows sent by a tutor or a student (column `sent_from`)
#      and only the columns that may be useful. This is preliminary; `text`
#      alone could be enough. The explicit copy makes `dfclean` independent
#      of `df`, so assigning to its columns below is safe.
dfclean = df.loc[
    df.sent_from.isin(['student', 'tutor']),
    ['session_id', 'sent_from', 'text'],
].copy()

# 3-7. In a single pass over `text`: parse the string into a list of tokens,
#      replace emojis by tokens, lower-case, and remove stop words.
#      Emoticon replacement (convert_emoticons) is intentionally left out for
#      now: it needs better handling, because regular use of parentheses,
#      braces and brackets seems to interfere with emoticon detection.
stopword_set = set(stopwords)  # constant-time membership tests
dfclean['text'] = dfclean.text.apply(_clean_tokens, args=(stopword_set,))

print_some_info(dfclean)

El conjunto de datos posee 210242 filas y 3 columnas
&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 210242 entries, 0 to 234374
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   session_id  210242 non-null  int64 
 1   sent_from   210242 non-null  object
 2   text        210242 non-null  object
dtypes: int64(1), object(2)
memory usage: 6.4+ MB
None


In [8]:
# Preview the first 50 rows of the cleaned frame
dfclean.head(n=50)

Unnamed: 0,session_id,sent_from,text
0,299895,tutor,"[welcome, yup, jasmyn, !]"
1,299895,tutor,"[may, help, math, today, ?]"
3,299899,student,[<url>]
5,299899,tutor,"[hello, luke, welcome, yup, !]"
6,299899,tutor,"[looking, problem]"
7,299899,student,"[36sqrt2, throwing, beginning, solve, problem]"
8,299899,student,"[36sqrt2, throwing, beginning, solve, problem]"
9,299899,tutor,"[okay, worries, let, 's, work, problem, together]"
10,299899,student,"[math, complete, 30, 60, 90, triangle, theorem..."
11,299899,student,"[math, complete, 30, 60, 90, triangle, theorem..."


In [31]:
# Print the raw `text` of a few hand-picked rows, one per line
for row_label in (195289, 18686, 28969, 43681, 32716, 25706):
    print(df.loc[row_label].text)

[&#39;Alright&#39;, &#39;👌&#39;]
[&#39;Hi&#39;, &#39;👋&#39;]
[&#39;Hi&#39;, &#39;👋&#39;]
[&#39;👍&#39;, &#39;Okay&#39;, &#39;!&#39;]
[&#39;Picture&#39;, &#39;😄&#39;]
[&#39;Nothing&#39;, &#39;😂&#39;]


In [37]:
# Print the `text` of the same hand-picked rows from the cleaned frame
for row_label in (195289, 18686, 28969, 43681, 32716, 25706):
    print(dfclean.loc[row_label].text)

[&#39;Alright&#39;, &#39;:OK_hand:&#39;]
[&#39;Hi&#39;, &#39;:waving_hand:&#39;]
[&#39;Hi&#39;, &#39;:waving_hand:&#39;]
[&#39;:thumbs_up:&#39;, &#39;Okay&#39;, &#39;!&#39;]
[&#39;Picture&#39;, &#39;:smiling_face_with_open_mouth_&amp;_smiling_eyes:&#39;]
[&#39;Nothing&#39;, &#39;:face_with_tears_of_joy:&#39;]


In [57]:
# Quick check of emoticon conversion on a few sample tokens. This reuses the
# convert_emoticons defined earlier in the notebook instead of redefining it
# here, so there is a single definition and a full re-run is unambiguous.
text = ["Hello", ":-]", ":-)"]
[convert_emoticons(t) for t in text]


[&#39;Hello&#39;, &#39;:happy_face_or_smiley:&#39;, &#39;:happy_face_smiley:&#39;]

In [3]:
# Display the stop-word list built earlier (NLTK English words plus the extra punctuation tokens)
stopwords

[&#39;i&#39;,
 &#39;me&#39;,
 &#39;my&#39;,
 &#39;myself&#39;,
 &#39;we&#39;,
 &#39;our&#39;,
 &#39;ours&#39;,
 &#39;ourselves&#39;,
 &#39;you&#39;,
 &quot;you&#39;re&quot;,
 &quot;you&#39;ve&quot;,
 &quot;you&#39;ll&quot;,
 &quot;you&#39;d&quot;,
 &#39;your&#39;,
 &#39;yours&#39;,
 &#39;yourself&#39;,
 &#39;yourselves&#39;,
 &#39;he&#39;,
 &#39;him&#39;,
 &#39;his&#39;,
 &#39;himself&#39;,
 &#39;she&#39;,
 &quot;she&#39;s&quot;,
 &#39;her&#39;,
 &#39;hers&#39;,
 &#39;herself&#39;,
 &#39;it&#39;,
 &quot;it&#39;s&quot;,
 &#39;its&#39;,
 &#39;itself&#39;,
 &#39;they&#39;,
 &#39;them&#39;,
 &#39;their&#39;,
 &#39;theirs&#39;,
 &#39;themselves&#39;,
 &#39;what&#39;,
 &#39;which&#39;,
 &#39;who&#39;,
 &#39;whom&#39;,
 &#39;this&#39;,
 &#39;that&#39;,
 &quot;that&#39;ll&quot;,
 &#39;these&#39;,
 &#39;those&#39;,
 &#39;am&#39;,
 &#39;is&#39;,
 &#39;are&#39;,
 &#39;was&#39;,
 &#39;were&#39;,
 &#39;be&#39;,
 &#39;been&#39;,
 &#39;being&#39;,
 &#39;have&#39;,
 &#39;has&#39;,
 &#39;had&#39;,
 &#3