# Extracción de características

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['This is the first document.','This is the second second document.', 'And the third one.', 'Is this the first document?']
count_vect = CountVectorizer() 
X_train_counts = count_vect.fit_transform(corpus) 
print(X_train_counts.shape)
print(count_vect.get_feature_names())
print(X_train_counts.toarray())

(4, 9)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [2]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
X_train_brigrams = bigram_vectorizer.fit_transform(corpus) 
print(X_train_brigrams.shape)
print(bigram_vectorizer.vocabulary_)
print(X_train_brigrams.toarray())

(4, 21)
{'this': 18, 'is': 5, 'the': 12, 'first': 3, 'document': 2, 'this is': 19, 'is the': 6, 'the first': 13, 'first document': 4, 'second': 9, 'the second': 14, 'second second': 11, 'second document': 10, 'and': 0, 'third': 16, 'one': 8, 'and the': 1, 'the third': 15, 'third one': 17, 'is this': 7, 'this the': 20}
[[0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0]
 [0 0 1 0 0 1 1 0 0 2 1 1 1 0 1 0 0 0 1 1 0]
 [1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0]
 [0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1]]


# Tf-idf

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf=tfidf_vect.fit_transform(corpus)
print(X_train_tfidf.shape)
print(tfidf_vect.get_feature_names())
print(X_train_tfidf.toarray())

(4, 9)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


# Preparación de datos

In [53]:
import pandas as pd
import numpy as np

spam_data = pd.read_csv('spam.csv',index_col=False)

spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


# División de muestras en entrenamiento (train) y prueba (test)

In [54]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)
print(len(X_train))

4179


### Pregunta 1

Ajustar los datos de entrenamiento `X_train` utilizando un `count_vectorizer` con parámetros predeterminados.

¿Cuál es el token más largo en el vocabulario?

*Esta función debería devolver una cadena.*

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

def respuesta_uno ():
    corpus = list(X_train)
    count_vect = CountVectorizer() 
    X_train_counts = count_vect.fit_transform(corpus) 
    maxi=''
    for i in count_vect.get_feature_names():
        print(i)
        if len(i)>len(maxi):
            maxi=i
    return maxi

respuesta_uno()


00
000
000pes
008704050406
0089
0121
01223585236
01223585334
0125698789
02
0207
02072069400
02073162414
02085076972
021
03
04
0430
05
0578
06
07008009200
07099833605
07123456789
0721072
07734396839
07742676969
07753741225
0776xxxxxxx
07781482378
07786200117
077xxx
078
07808247860
07808726822
07821230901
078498
07880867867
0789xxxxxxx
07946746291
0796xxxxxx
07973788240
07xxxxxxxxx
08
0800
08000407165
08000839402
08000930705
08000938767
08001950382
08002888812
08002986030
08002986906
08006344447
0808
08081263000
08081560665
0825
0844
08448350055
08448714184
0845
08452810071
08452810073
08452810075over18
0870
08700435505150p
08700469649
08700621170150p
08701213186
08701237397
08701417012
08701417012150p
087016248
08701752560
087018728737
0870241182716
08702840625
08704050406
08704439680
08704439680ts
08706091795
08707509020
08707808226
08708034412
08708800282
08709222922
0871
087104711148
08712101358
08712103738
0871212025016
08712300220
087123002209am
08712317606
08712400602450p
08712400

loan
loans
local
location
locations
locaxx
lodging
log
logging
login
logo
logoff
logon
logos
loko
lol
lolnice
lololo
londn
london
lonely
long
longer
lonlines
loo
look
looked
lookin
looking
looks
lool
looovvve
loose
loosing
loosu
lor
lord
lose
losers
loses
losing
loss
lost
lot
lotr
lots
lotsly
lotta
lotto
lotz
loud
lounge
lousy
lov
lovable
love
loved
lovejen
lovely
loveme
lover
loverboy
lovers
loves
loving
lovingly
lovly
low
lower
lowes
loxahatchee
loyal
loyalty
ls1
ls15hb
lt
ltd
ltdhelpdesk
lubly
luck
luckily
lucky
lucozade
lucy
lucyxx
lul
lunch
lunchtime
lunsford
lush
luton
luv
luvd
luvs
lux
luxury
lvblefrnd
lyf
lyfu
lying
lyk
lyricalladie
lyrics
m221bp
m227xy
m26
m263uz
m39m51
m6
m60
m8
m8s
m95
ma
maaaan
maangalyam
maat
mac
macedonia
macha
machan
machi
machines
macho
mack
macs
mad
mad1
mad2
madam
made
madodu
madoke
madstini
madurai
mag
maga
magazine
magic
magical
magicalsongs
mah
mahal
maid
mail
mailbox
mailed
mails
main
maintain
major
make
makes
makin
making
malaria
malarky
male
mal

'com1win150ppmx3age16subscription'

### Pregunta 2

¿Cuál es el número promedio de caracteres por documento para los documentos no spam y spam?

*Esta función debe devolver una tupla (promedio de # caracteres no es spam, promedio # caracteres spam).*

In [7]:
import pandas as pd
def respuesta_dos():
    s_c_sp=0
    s_c_nsp=0
    tot_sp=0
    tot_nsp=0
    for indice_fila, fila in spam_data.iterrows():
        if fila['target']==1:
            tot_sp+=1
            s_c_sp+=len(fila['text'])
        else:
            tot_nsp+=1
            s_c_nsp+=len(fila['text'])
    return (s_c_nsp/tot_nsp, s_c_sp/tot_sp)
respuesta_dos()


(71.02362694300518, 138.8661311914324)

### Pregunta 3

¿Cuál es el número promedio de dígitos por documento para los documentos no spam y spam?

*Esta función debe devolver una tupla (promedio de # dígitos no es spam, promedio # dígitos spam).*

In [8]:
import re
def respuesta_tres():
    s_c_sp=0
    s_c_nsp=0
    tot_sp=0
    tot_nsp=0
    pattern1=re.compile('\d')
    for indice_fila, fila in spam_data.iterrows():
        if fila['target']==1:
            tot_sp+=1
            s_c_sp+=len(pattern1.findall(fila['text']))
        else:
            tot_nsp+=1
            s_c_nsp+=len(pattern1.findall(fila['text']))    
    return (s_c_nsp/tot_nsp, s_c_sp/tot_sp)
respuesta_tres()


(0.2992746113989637, 15.759036144578314)

### Pregunta 4

¿Cuál es el número promedio de caracteres que no son palabras (cualquier cosa que no sea una letra, un dígito o un guión bajo) por documento para los documentos que no son spam y spam?

*Sugerencia: utilice las clases de caracteres `\ w` y` \ W`*

*Esta función debe devolver una tupla (promedio de # caracteres que no son palabras, no spam, promedio de # caracteres que no son palabras, spam).*

In [9]:
def pregunta_cuatro():
    s_c_sp=0
    s_c_nsp=0
    tot_sp=0
    tot_nsp=0
    pattern1=re.compile('\W')
    for indice_fila, fila in spam_data.iterrows():
        if fila['target']==1:
            tot_sp+=1
            s_c_sp+=len(pattern1.findall(fila['text']))
        else:
            tot_nsp+=1
            s_c_nsp+=len(pattern1.findall(fila['text']))

    
    return (s_c_nsp/tot_nsp, s_c_sp/tot_sp)
pregunta_cuatro()

(17.29181347150259, 29.041499330655956)

### Pregunta 5

¿Cuál es el tamaño del vocabulario en `X_train` y `X_test`, primero utilizando la función `fit_transform` en ambos (train y test), luego utilizando `fit_transform` sobre el train y solo `transform` en el test



*Esta función debe devolver dos tuplas una con `fit_transform` y la otra con `transform` (vocabulario en `X_train`, vocabulario en `X_test`), (vocabulario en `X_train`, vocabulario en `X_test`).*.*

In [16]:
def respuesta_cinco():
    count_vect = CountVectorizer()
    X_train_count = count_vect.fit_transform(X_train)
    t1 = len(count_vect.get_feature_names())
    X_test_count = count_vect.fit_transform(X_test)
    t2 = len(count_vect.get_feature_names())
    
    count2 = CountVectorizer()
    X_train_count1 = count2.fit_transform(X_train)
    t3 = len(count2.get_feature_names())
    X_test_count1 = count2.transform(X_test)
    t4 = len(count2.get_feature_names())
    return ((t1,t2),(t3,t4))

respuesta_cinco()

((7354, 4170), (7354, 7354))

### Pregunta 6

¿Cuales son las 10 palabras mas frecuentes (sin tener en cuenta *Stopwords*) en los documentos que no son spam y spam?


*Esta función debe devolver una tupla (palabras mas frecuentes, no spam, palabras mas frecuentes, spam).*

In [77]:
import nltk
from nltk.corpus import stopwords
list_stop = list(set(stopwords.words('english')))

def pregunta_seis():
    spam = spam_data[spam_data['target'] == 0]['text']
    no_spam = spam_data[spam_data['target']==1]['text']
    
    count_vect = CountVectorizer()
    
    #Obtengo las frecuencias de palabras en documentos spam
    spam_count = count_vect.fit_transform(spam)
    palabras_spam = count_vect.get_feature_names()
    frecuentes_spam = spam_count.toarray().sum(axis=0)
    dic_spam = dict(zip(palabras_spam,frecuentes_spam))
    
    #Obtengo las frecuencias de las palabras de documentos no spam
    no_spam_count = count_vect.fit_transform(no_spam)
    palabras_no_spam = count_vect.get_feature_names()
    frecuentes_no_spam = no_spam_count.toarray().sum(axis=0)
    dic_no_spam = dict(zip(palabras_no_spam,frecuentes_no_spam))
    
    #Quito las Stopwords 
    for xs in list_stop:
        try:
            dic_no_spam.pop(xs)
            dic_spam.pop(xs)
        except:
            pass   
        
    return (ordenaDicFrec(dic_no_spam)[0:10], ordenaDicFrec(dic_spam)[0:10])



def ordenaDicFrec(dicfrec):
    aux = [(dicfrec[key], key) for key in dicfrec]
    aux.sort()
    aux.reverse()
    return aux

pregunta_seis()

([(355, 'call'),
  (224, 'free'),
  (163, 'txt'),
  (144, 'ur'),
  (127, 'mobile'),
  (125, 'text'),
  (121, 'stop'),
  (113, 'claim'),
  (104, 'reply'),
  (98, 'www')],
 [(318, 'gt'),
  (316, 'lt'),
  (305, 'get'),
  (287, 'ok'),
  (249, 'go'),
  (241, 'ur'),
  (236, 'know'),
  (236, 'call'),
  (233, 'good'),
  (232, 'like')])

### Pregunta 7

¿Cuales son las 10 palabras mas frecuentes (solo teniendo en cuenta *Stopwords*) en los documentos que no son spam y spam?


*Esta función debe devolver una tupla (palabras mas frecuentes, no spam, palabras mas frecuentes, spam).*

In [92]:

def pregunta_siete():
    
    spam = spam_data[spam_data['target'] == 0]['text']
    no_spam = spam_data[spam_data['target']==1]['text']
    
    count_vect = CountVectorizer()
    
    spam_count = count_vect.fit_transform(spam)
    palabras_spam = count_vect.get_feature_names()
    frecuentes_spam = spam_count.toarray().sum(axis=0)
    dic_spam = dict(zip(palabras_spam,frecuentes_spam))
    
    count_vect1 = CountVectorizer()
    no_spam_count = count_vect1.fit_transform(no_spam)
    palabras_no_spam = count_vect1.get_feature_names()
    frecuentes_no_spam = no_spam_count.toarray().sum(axis=0)
    dic_no_spam = dict(zip(palabras_no_spam,frecuentes_no_spam))
    
    #Solo obtengo las StopWords
    
    palabras_spam =[]
    ocurrencia_spam = []
    palabras_no_spam = []
    ocurrencia_no_spam = []
    for xs in list_stop:
        
        valor_spam = dic_spam.get(xs)
        if valor_spam is not None:
            palabras_spam.append(xs)
            ocurrencia_spam.append(valor_spam)
        else:
            pass

        valor_no_spam = dic_no_spam.get(xs)
        if valor_no_spam is not None:
            palabras_no_spam.append(xs)
            ocurrencia_no_spam.append(valor_no_spam)
        else:
            pass
           
    spam = dict(zip(palabras_spam, ocurrencia_spam))
    no_spam = dict(zip(palabras_no_spam,ocurrencia_no_spam))
    return (ordenaDicFrec(no_spam)[0:10], ordenaDicFrec(spam)[0:10])



def ordenaDicFrec(dicfrec):
    aux = [(dicfrec[key], key) for key in dicfrec]
    aux.sort()
    aux.reverse()
    return aux
 
    
    return 
pregunta_siete()

([(688, 'to'),
  (297, 'you'),
  (264, 'your'),
  (206, 'the'),
  (203, 'for'),
  (199, 'now'),
  (188, 'or'),
  (158, 'is'),
  (144, 'on'),
  (135, 'have')],
 [(1943, 'you'),
  (1554, 'to'),
  (1122, 'the'),
  (857, 'and'),
  (818, 'in'),
  (772, 'me'),
  (750, 'my'),
  (732, 'is'),
  (711, 'it'),
  (551, 'that')])