In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Paso 1: Cargar y preparar los datos
df1 = pd.read_csv('snips_only_intent.csv')
df1 = df1.drop(['Unnamed: 0'], axis=1)
df1 = pd.DataFrame(df1)
df, df2 = train_test_split(df1, test_size=0.2, random_state=42)

In [3]:
df1[df1['sentences'] == 'need a table now somewhere nearby Petit Manan National Wildlife Refuge']

Unnamed: 0,sentences,labels
5174,need a table now somewhere nearby Petit Manan ...,get_weather


In [4]:
df1[df1['sentences'] == 'On oct. 26, I will need to make reservations to eat in Halibut Point State Park.']

Unnamed: 0,sentences,labels
4861,"On oct. 26, I will need to make reservations t...",get_weather


In [32]:
label_mapping = {
    'add_to_playlist': 0,
    'rate_book': 1,
    'get_weather': 2,
    'book_restaurant': 3,
    'play_music': 4,
    'search_creative_work': 5,
    'search_screening_event': 6
}

# Convertir etiquetas a números según la correspondencia
df['labels'] = df['labels'].map(label_mapping)

In [33]:
# Preprocesar las oraciones y crear el vocabulario
def preprocess(sentence):
    return sentence.lower().split()

In [34]:
# Crear el vocabulario basado en las etiquetas
vocab = defaultdict(lambda: defaultdict(int))

for _, row in df.iterrows():
    words = preprocess(row['sentences'])
    label = row['labels']
    for word in words:
        vocab[label][word] += 1

In [35]:
# Crear una lista de todas las palabras únicas
unique_words = list(set(word for label in vocab for word in vocab[label]))
labels = sorted(vocab.keys())
word_index = {word: idx for idx, word in enumerate(unique_words)}

In [36]:
# Inicializar la matriz de ocurrencias
occurrence_matrix = np.zeros((len(labels), len(unique_words)), dtype=int)


In [37]:
# Llenar la matriz de ocurrencias
for label in labels:
    for word, count in vocab[label].items():
        occurrence_matrix[label][word_index[word]] = count

In [38]:
# Convertir la matriz a un DataFrame para una mejor visualización
occurrence_df = pd.DataFrame(occurrence_matrix, index=labels, columns=unique_words)
print(occurrence_df.transpose())

            0  1  2  3  4  5  6
sonning     0  0  0  2  0  0  0
ritchie     1  0  0  0  0  0  0
vipers      0  0  0  0  0  1  0
pastwatch:  0  1  0  0  0  0  0
edge        1  1  0  0  1  1  1
...        .. .. .. .. .. .. ..
norman      1  0  0  0  0  0  0
2036?       0  0  2  0  0  0  0
lenin       0  0  0  0  0  1  0
highland    0  0  1  0  0  0  0
bells       0  0  0  0  1  1  0

[12008 rows x 7 columns]


In [39]:
ocurrence = occurrence_df.transpose()

In [40]:
add_to_playlist = ocurrence.sort_values(by=0, ascending=False)
rate_book = ocurrence.sort_values(by=1, ascending=False)
get_weather = ocurrence.sort_values(by=2, ascending=False)
book_restaurant = ocurrence.sort_values(by=3, ascending=False)
play_music = ocurrence.sort_values(by=4, ascending=False)
search_creative_work = ocurrence.sort_values(by=5, ascending=False)
search_screening_event = ocurrence.sort_values(by=6, ascending=False)

In [41]:
add_to_playlist

Unnamed: 0,0,1,2,3,4,5,6
to,1424,273,180,395,241,240,144
add,1344,6,0,0,0,0,0
my,876,38,66,113,55,29,9
playlist,771,1,0,0,64,0,0
the,599,947,1058,340,532,1482,1348
...,...,...,...,...,...,...,...
anguilla?,0,0,1,0,0,0,0
purchase,0,0,0,0,0,12,0
sergei,0,0,0,0,1,0,0
innocence,0,0,0,0,0,1,0


In [42]:
rate_book

Unnamed: 0,0,1,2,3,4,5,6
of,100,1180,68,192,69,365,157
the,599,947,1058,340,532,1482,1348
rate,2,856,0,0,0,1,0
6,2,796,9,52,0,1,4
a,143,632,110,1913,307,369,140
...,...,...,...,...,...,...,...
scantlin,1,0,0,0,0,0,0
dynamite,0,0,0,0,0,1,0
hoffman,0,0,1,0,0,0,0
shore,0,0,1,0,0,0,0


In [43]:
get_weather

Unnamed: 0,0,1,2,3,4,5,6
in,135,50,1206,1082,28,92,278
the,599,947,1058,340,532,1482,1348
weather,0,0,743,0,0,0,0
be,11,10,641,2,7,3,30
is,103,50,594,49,13,65,451
...,...,...,...,...,...,...,...
joann's,1,0,0,0,0,0,0
whee,1,0,0,0,0,0,0
itsu,0,0,0,1,0,0,0
hugh,1,0,0,0,1,0,0


In [44]:
book_restaurant

Unnamed: 0,0,1,2,3,4,5,6
a,143,632,110,1913,307,369,140
for,34,95,476,1246,14,258,458
in,135,50,1206,1082,28,92,278
book,2,244,4,1047,1,66,0
at,9,16,230,833,0,26,629
...,...,...,...,...,...,...,...
remembered,0,0,0,0,0,0,1
race,0,2,0,0,0,0,1
titled,8,10,0,0,1,29,0
klute,1,0,0,0,0,0,0


In [45]:
play_music

Unnamed: 0,0,1,2,3,4,5,6
play,9,0,0,0,1488,92,14
from,14,22,162,115,539,25,42
the,599,947,1058,340,532,1482,1348
by,95,4,22,21,488,14,44
on,116,14,200,131,481,40,19
...,...,...,...,...,...,...,...
mashed,0,0,0,1,0,0,0
opening,0,0,0,0,0,0,1
simplest,0,1,0,0,0,0,0
"bonaventure,",0,0,0,1,0,0,0


In [46]:
search_creative_work

Unnamed: 0,0,1,2,3,4,5,6
the,599,947,1058,340,532,1482,1348
find,3,3,1,56,5,724,360
show,4,1,29,0,0,449,165
a,143,632,110,1913,307,369,140
of,100,1180,68,192,69,365,157
...,...,...,...,...,...,...,...
robb,0,0,1,0,0,0,0
jarnowick,1,0,0,0,0,0,0
municipal,0,0,0,3,0,0,0
mopreme,1,0,0,0,0,0,0


In [47]:
search_screening_event

Unnamed: 0,0,1,2,3,4,5,6
the,599,947,1058,340,532,1482,1348
movie,0,1,0,0,2,62,851
at,9,16,230,833,0,26,629
what,3,6,474,1,1,31,493
for,34,95,476,1246,14,258,458
...,...,...,...,...,...,...,...
calientes,1,0,0,0,0,0,0
principle,0,1,0,0,0,0,0
stories,2,7,0,0,0,1,0
processing,0,2,0,0,0,0,0


In [48]:
#Obtener las palabras mas significativas de add_to_playlist
add_to_playlist['Sum_other_columns'] = add_to_playlist.sum(axis=1)
add_to_playlist['Porcentaje'] = add_to_playlist[0]/ add_to_playlist['Sum_other_columns']
add_to_playlist_filtro = add_to_playlist[add_to_playlist['Porcentaje']>0.7]
add_to_playlist_filtro = add_to_playlist_filtro.drop('Sum_other_columns',axis=1)
add_to_playlist_filtro = add_to_playlist_filtro.sort_values(by=0, ascending=False)
add_to_playlist_filtro

Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
add,1344,6,0,0,0,0,0,0.995556
my,876,38,66,113,55,29,9,0.738617
playlist,771,1,0,0,64,0,0,0.922249
playlist.,255,0,0,0,16,0,0,0.940959
put,186,3,0,0,3,1,0,0.963731
...,...,...,...,...,...,...,...,...
berryz,1,0,0,0,0,0,0,1.000000
arena,1,0,0,0,0,0,0,1.000000
lessie's,1,0,0,0,0,0,0,1.000000
harlow,1,0,0,0,0,0,0,1.000000


In [49]:
#Obtener las palabras mas significativas de rate_book
rate_book['Sum_other_columns'] = rate_book.sum(axis=1)
rate_book['Porcentaje'] = rate_book[1]/ rate_book['Sum_other_columns']
rate_book_filtro = rate_book[rate_book['Porcentaje']>0.7]
rate_book_filtro = rate_book_filtro.drop('Sum_other_columns',axis=1)
rate_book_filtro = rate_book_filtro.sort_values(by=1, ascending=False)
rate_book_filtro



Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
rate,2,856,0,0,0,1,0,0.996508
6,2,796,9,52,0,1,4,0.921296
out,36,578,5,1,6,4,3,0.913112
give,2,544,26,1,1,3,49,0.869010
stars,2,425,0,0,1,4,3,0.977011
...,...,...,...,...,...,...,...,...
occult,0,1,0,0,0,0,0,1.000000
marxism,0,1,0,0,0,0,0,1.000000
spears,0,1,0,0,0,0,0,1.000000
braindead,0,1,0,0,0,0,0,1.000000


In [50]:
#Obtener las palabras mas significativas de get_weather
get_weather['Sum_other_columns'] = get_weather.sum(axis=1)
get_weather['Porcentaje'] = get_weather[2]/ get_weather['Sum_other_columns']
get_weather_filtro = get_weather[get_weather['Porcentaje']>0.7]
get_weather_filtro = get_weather_filtro.drop('Sum_other_columns',axis=1)
get_weather_filtro = get_weather_filtro.sort_values(by=2, ascending=False)
get_weather_filtro


Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
weather,0,0,743,0,0,0,0,1.000000
be,11,10,641,2,7,3,30,0.910511
will,4,2,569,6,5,2,29,0.922204
it,9,14,504,0,6,11,2,0.923077
forecast,0,0,465,0,0,0,0,1.000000
...,...,...,...,...,...,...,...,...
sarygamyş,0,0,1,0,0,0,0,1.000000
4/19/2030,0,0,1,0,0,0,0,1.000000
07:43:21,0,0,1,0,0,0,0,1.000000
1/11/2040?,0,0,1,0,0,0,0,1.000000


In [51]:
#Obtener las palabras mas significativas de book_restaurant
book_restaurant['Sum_other_columns'] = book_restaurant.sum(axis=1)
book_restaurant['Porcentaje'] = book_restaurant[3]/ book_restaurant['Sum_other_columns']
book_restaurant_filtro = book_restaurant[book_restaurant['Porcentaje']>0.7]
book_restaurant_filtro = book_restaurant_filtro.drop('Sum_other_columns',axis=1)
book_restaurant_filtro = book_restaurant_filtro.sort_values(by=3, ascending=False)
book_restaurant_filtro


Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
book,2,244,4,1047,1,66,0,0.767595
restaurant,0,0,4,588,0,0,0,0.993243
table,1,0,1,428,0,0,0,0.995349
that,1,11,0,241,15,12,48,0.734756
need,8,1,24,225,8,26,21,0.718850
...,...,...,...,...,...,...,...,...
parry,0,0,0,1,0,0,0,1.000000
verro,0,0,0,1,0,0,0,1.000000
cusinie,0,0,0,1,0,0,0,1.000000
floating,0,0,0,1,0,0,0,1.000000


In [52]:
#Obtener las palabras mas significativas de play_music
play_music['Sum_other_columns'] = play_music.sum(axis=1)
play_music['Porcentaje'] = play_music[4]/ play_music['Sum_other_columns']
play_music_filtro = play_music[play_music['Porcentaje']>0.7]
play_music_filtro = play_music_filtro.drop('Sum_other_columns',axis=1)
play_music_filtro = play_music_filtro.sort_values(by=4, ascending=False)
play_music_filtro


Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
play,9,0,0,0,1488,92,14,0.928260
by,95,4,22,21,488,14,44,0.709302
music,35,1,0,0,433,12,1,0.898340
some,15,2,0,4,243,3,7,0.886861
hear,0,0,0,0,125,9,1,0.925926
...,...,...,...,...,...,...,...,...
adieu,0,0,0,0,1,0,0,1.000000
ahmed,0,0,0,0,1,0,0,1.000000
vegetables,0,0,0,0,1,0,0,1.000000
khare,0,0,0,0,1,0,0,1.000000


In [53]:
#Obtener las palabras mas significativas de search_creative_work
search_creative_work['Sum_other_columns'] = search_creative_work.sum(axis=1)
search_creative_work['Porcentaje'] = search_creative_work[5]/ search_creative_work['Sum_other_columns']
search_creative_work_filtro = search_creative_work[search_creative_work['Porcentaje']>0.7]
search_creative_work_filtro = search_creative_work_filtro.drop('Sum_other_columns',axis=1)
search_creative_work_filtro = search_creative_work_filtro.sort_values(by=5, ascending=False)
search_creative_work_filtro


Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
called,37,13,0,1,0,249,2,0.824503
tv,0,0,0,0,0,159,0,1.000000
game,1,0,1,0,3,127,2,0.947761
search,0,2,0,0,1,87,1,0.956044
look,1,1,2,0,0,86,2,0.934783
...,...,...,...,...,...,...,...,...
prescription,0,0,0,0,0,1,0,1.000000
drugs,0,0,0,0,0,1,0,1.000000
swim,0,0,0,0,0,1,0,1.000000
puro,0,0,0,0,0,1,0,1.000000


In [54]:
#Obtener las palabras mas significativas de search_screening_event
search_screening_event['Sum_other_columns'] = search_screening_event.sum(axis=1)
search_screening_event['Porcentaje'] = search_screening_event[6]/ search_screening_event['Sum_other_columns']
search_screening_event_filtro = search_screening_event[search_screening_event['Porcentaje']>0.7]
search_screening_event_filtro = search_screening_event_filtro.drop('Sum_other_columns',axis=1)
search_screening_event_filtro = search_screening_event_filtro.sort_values(by=6, ascending=False)
search_screening_event_filtro


Unnamed: 0,0,1,2,3,4,5,6,Porcentaje
movie,0,1,0,0,2,62,851,0.929039
playing,3,0,0,0,5,2,401,0.975669
movies,2,0,0,0,1,0,385,0.992268
schedule,0,0,0,0,0,7,275,0.975177
are,3,3,4,2,4,7,267,0.920690
...,...,...,...,...,...,...,...,...
won,0,0,0,0,0,0,1,1.000000
rumyantsev,0,0,0,0,0,0,1,1.000000
wooly,0,0,0,0,0,0,1,1.000000
00:47:43,0,0,0,0,0,0,1,1.000000


In [68]:
print("add_to_playlist_filtro--------------------------------------------------------")
print(add_to_playlist_filtro.head(10),"\n", add_to_playlist_filtro.tail(10))
print("rate_book--------------------------------------------------------")
print(rate_book_filtro.head(10),"\n", rate_book_filtro.tail(10))
print("get_weather--------------------------------------------------------")
print(get_weather_filtro.head(10),"\n", get_weather_filtro.tail(10))
print("book_restaurant--------------------------------------------------------")
print(book_restaurant_filtro.head(10),"\n", book_restaurant_filtro.tail(10))
print("play_music--------------------------------------------------------")
print(play_music_filtro.head(10),"\n", play_music_filtro.tail(10))
print("search_creative_work--------------------------------------------------------")
print(search_creative_work_filtro.head(10),"\n", search_creative_work_filtro.tail(10))
print("search_screening_event--------------------------------------------------------")
print(search_screening_event_filtro.head(10),"\n", search_screening_event_filtro.tail(10))
print("--------------------------------------------------------")


add_to_playlist_filtro--------------------------------------------------------
              0   1   2    3   4   5  6  Porcentaje
add        1344   6   0    0   0   0  0    0.995556
my          876  38  66  113  55  29  9    0.738617
playlist    771   1   0    0  64   0  0    0.922249
playlist.   255   0   0    0  16   0  0    0.940959
put         186   3   0    0   3   1  0    0.963731
tune        168   0   0    0  37   0  0    0.819512
artist      139   2   0    0  21   1  0    0.852761
onto         77   0   0    0   0   0  0    1.000000
metal        59   0   0    0   4   0  1    0.921875
indie        46   0   0    0   7   0  0    0.867925 
              0  1  2  3  4  5  6  Porcentaje
rombola      1  0  0  0  0  0  0         1.0
checkmate    1  0  0  0  0  0  0         1.0
wilcox       1  0  0  0  0  0  0         1.0
choir        1  0  0  0  0  0  0         1.0
księga       1  0  0  0  0  0  0         1.0
berryz       1  0  0  0  0  0  0         1.0
arena        1  0  0  0  0  0  0