# Tokenizer

In [3]:

sentences = ["Soy muy proclive a enroscarme en soluciones que no funcionan", "Tengo que cambiar eso"]


sentences[0].split()

['Soy',
 'muy',
 'proclive',
 'a',
 'enroscarme',
 'en',
 'soluciones',
 'que',
 'no',
 'funcionan']

In [14]:

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

tokenizer.index_word
tokenizer.texts_to_sequences(sentences)

[[2, 3, 4, 5, 6, 7, 8, 1, 9, 10], [11, 1, 12, 13]]

In [12]:

tokenizer = Tokenizer(num_words=6, oov_token='')
tokenizer.fit_on_texts(sentences)
tokenizer.texts_to_sequences(sentences)

[[3, 4, 5, 1, 1, 1, 1, 2, 1, 1], [1, 2, 1, 1]]

# Padding

In [13]:
# prompt: un par de ejemplos cortos y faciles para ilustrar el padding de keras mostrando truncatin, padding, maxlen

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example with padding
padded_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences))
print("Padding only:")
print(padded_sentences)

# Example with padding and maxlen
padded_sentences_maxlen = pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=10)
print("\nPadding and maxlen=10:")
print(padded_sentences_maxlen)

# Example with padding, maxlen, and truncating
padded_sentences_truncating = pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=5, truncating='post')
print("\nPadding, maxlen=5, and truncating='post':")
padded_sentences_truncating


Padding only:
[[3 4 5 1 1 1 1 2 1 1]
 [0 0 0 0 0 0 1 2 1 1]]

Padding and maxlen=10:
[[3 4 5 1 1 1 1 2 1 1]
 [0 0 0 0 0 0 1 2 1 1]]

Padding, maxlen=5, and truncating='post':


array([[3, 4, 5, 1, 1],
       [0, 1, 2, 1, 1]], dtype=int32)

# Lemma stemm stopwords

Stemming

In [14]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('spanish')

In [15]:
for s in sentences:
  for w in sentences[0].split():
      print(f"{w} --> {stemmer.stem(w)}")

Soy --> soy
muy --> muy
proclive --> procliv
a --> a
enroscarme --> enrosc
en --> en
soluciones --> solucion
que --> que
no --> no
funcionan --> funcion
Soy --> soy
muy --> muy
proclive --> procliv
a --> a
enroscarme --> enrosc
en --> en
soluciones --> solucion
que --> que
no --> no
funcionan --> funcion


Lemmatazing

In [2]:
!python -m spacy download es_core_news_sm

import spacy

nlp = spacy.load('es_core_news_sm')

Collecting es-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
doc = nlp(sentences[0])
type(doc)

spacy.tokens.doc.Doc

In [18]:
for s in sentences:
  for w in doc:
    lemma = w.lemma_
    print(f"{w} --> {lemma}")

Soy --> ser
muy --> mucho
proclive --> proclive
a --> a
enroscarme --> enroscar yo
en --> en
soluciones --> solución
que --> que
no --> no
funcionan --> funcionar
Soy --> ser
muy --> mucho
proclive --> proclive
a --> a
enroscarme --> enroscar yo
en --> en
soluciones --> solución
que --> que
no --> no
funcionan --> funcionar


Stopwords

In [19]:
stopwordlist = nlp.Defaults.stop_words
# print(stopwordlist)

# Ejercicio video 4

In [6]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_home.csv")
df.head()

# prompt: asignar a una nueva columna badproduct 1 si es que la calficacion es 1 o  2 estrellas y 0 en otros casos. quedarme luego solo con esa columna y review body

df['badproduct'] = df['stars'].apply(lambda x: 1 if x in [1, 2] else 0)

df = df[['badproduct', 'review_body']]

print(df.head())

   badproduct                                        review_body
0           1  Jamás me llegó y el vendedor nunca contacto co...
1           1  Pone que son 4 piezas y la realidad es que es ...
2           1  Saltan los plomos al tercer día de uso. A devo...
3           1  No me ha gustado de hecho la devolví . Súper p...
4           1  Por más que busque, no le encuentro el agujero...


**Preprocessing**

lematizar

In [7]:
def lematize(txt):
  doc = nlp(txt)
  return[w.lemma_ for w in doc]

df['review_body'] = df['review_body'].apply(lematize)
df.head()

Unnamed: 0,badproduct,review_body
0,1,"[jamás, yo, llegar, y, el, vendedor, nunca, co..."
1,1,"[poner, que, ser, 4, pieza, y, el, realidad, s..."
2,1,"[saltar, el, plomos, al, tercer, día, de, uso,..."
3,1,"[no, yo, haber, gustar, de, hecho, el, devolví..."
4,1,"[por, más, que, buscar, ,, no, él, encontrar, ..."


remove stopwords

In [8]:
def remove_stopwords(text):
    stopwords = nlp.Defaults.stop_words
    filtered_text = [w for w in text if w not in stopwords ]
    return filtered_text

# que curiosoo como podemos aaplicar una funcion a una columna de un dataframe en pandas
df['review_body'] = df['review_body'].apply(remove_stopwords)
df.head()

Unnamed: 0,badproduct,review_body
0,1,"[jamás, llegar, vendedor, contacto, intentar é..."
1,1,"[4, pieza, realidad, 3, pieza]"
2,1,"[saltar, plomos, tercer, ., devolver]"
3,1,"[gustar, devolví, ., súper, pesado, manejar]"
4,1,"[buscar, ,, encontrar, agujero, meter, chuch, ..."


normalizacion

In [9]:
import re

def clean_text_list(text_list):
  # This regex keeps letters (including Spanish), numbers, and some basic punctuation
  cleaned_list = []
  for text in text_list:
    cleaned_text = re.sub(r'[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!?; ]', '', text.lower())
    cleaned_list.append(cleaned_text)
  return cleaned_list

df['review_body'] = df['review_body'].apply(clean_text_list)
df.head()

Unnamed: 0,badproduct,review_body
0,1,"[jamás, llegar, vendedor, contacto, intentar é..."
1,1,"[4, pieza, realidad, 3, pieza]"
2,1,"[saltar, plomos, tercer, ., devolver]"
3,1,"[gustar, devolví, ., súper, pesado, manejar]"
4,1,"[buscar, ,, encontrar, agujero, meter, chuch, ..."


In [10]:
df.reset_index(drop=True, inplace=True)
df.tail()

Unnamed: 0,badproduct,review_body
26957,0,"[esperado, ,, toalla, .]"
26958,0,"[venir, explicado, descripción, ., ocupar, col..."
26959,0,"[sartén, antiadherente, tipo, comida, ,, tamañ..."
26960,0,"[llego, ,, super, fácil, montar, ,, calidad, ,..."
26961,0,"[súper, !, brocha, ,, caer, pelito, ,, chula, xd]"


Clasificacion de textos

In [12]:
sentences_bad_products = df[df.badproduct == 1]['review_body'].tolist()
sentences_good_products = df[df.badproduct == 0]['review_body'].tolist()

In [59]:
voc_length = 30

tokenizer_bad = Tokenizer(num_words=voc_length)
tokenizer_bad.fit_on_texts(sentences_bad_products)

tokenizer_good = Tokenizer(num_words=voc_length)
tokenizer_good.fit_on_texts(sentences_good_products)

bad_reviews_tokenized = tokenizer_bad.texts_to_sequences(sentences_bad_products)
godd_reviews_tokenized = tokenizer_good.texts_to_sequences(sentences_good_products)

In [60]:
data_bad = []
for tokens in bad_reviews_tokenized:
  row = [1 if i in tokens else 0 for i in range(1, voc_length+1)]
  data_bad.append(row)

column_names = [f'bad{i}' for i in range(1, voc_length+1)]

df_bad = pd.DataFrame(data, columns=column_names)
df_bad.head()


Unnamed: 0,bad1,bad2,bad3,bad4,bad5,bad6,bad7,bad8,bad9,bad10,...,bad21,bad22,bad23,bad24,bad25,bad26,bad27,bad28,bad29,bad30
0,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
data_good = []
for tokens in godd_reviews_tokenized:
  row = [1 if i in tokens else 0 for i in range(1, voc_length+1)]
  data_good.append(row)

column_names = [f'good{i}' for i in range(1, voc_length+1)]
df_good = pd.DataFrame(data, columns=column_names)
df_good.head()

Unnamed: 0,good1,good2,good3,good4,good5,good6,good7,good8,good9,good10,...,good21,good22,good23,good24,good25,good26,good27,good28,good29,good30
0,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
print(len(df_bad))
print(len(df_good))

10933
26962


In [62]:
df_final = pd.merge(df_bad, df_good, left_index=True, right_index=True, how="inner")
df_final.head()

Unnamed: 0,bad1,bad2,bad3,bad4,bad5,bad6,bad7,bad8,bad9,bad10,...,good21,good22,good23,good24,good25,good26,good27,good28,good29,good30
0,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
df_final['y'] = df.badproduct


In [66]:

print(df.badproduct.unique())
print(df_final.y.unique())

[1 0]
[1 0]


In [69]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


X = df_final.drop('y', axis=1)
y = df_final['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVC(degree=3,kernel="poly",)
model.fit(X_train, y_train)
print("Model score:", model.score(X_test, y_test))

Model score: 0.6877729257641921


In [70]:
model.fit(X,y)
model.score(X,y)

0.7405951712521056