## TEXTO

In [1]:
import pandas as pd
import numpy as np 
import random
from sklearn.model_selection import train_test_split
import re
from unidecode import unidecode
import unicodedata
import warnings
warnings.filterwarnings("ignore")

## FUNCIONES

In [2]:
def clean_text(text, pattern="[^a-zA-Z0-9 ]"):
    cleaned_text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern, " ", cleaned_text.decode("utf-8"), flags=re.UNICODE)
    cleaned_text = u' '.join(cleaned_text.lower().strip().split())
    return cleaned_text

# TEXTO

- NLP (Procesamiento del lenguaje natural) es un campo de la inteligencia artificial que estudia las interacciones entre las computadoras y los lenguajes humanos, en particular cómo programar computadoras para procesar y analizar grandes cantidades de datos del lenguaje natural.

In [3]:
path="../data/ingenieria/News_Category_Dataset.csv"

In [4]:
df = pd.read_csv(path)

In [6]:
df.shape

(66624, 5)

In [5]:
df.head()

Unnamed: 0,tgt,text,authors,link,date
0,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26
1,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26
2,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26
3,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26
4,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26


In [7]:
df["tgt"].value_counts()

POLITICS         32739
WELLNESS         17827
ENTERTAINMENT    16058
Name: tgt, dtype: int64

In [8]:
df=df.rename(columns={"text":"t_text","authors":"t_authors","date":"d_date","link":"t_link"})

In [10]:
df["t_text_ori"]=df["t_text"]

In [8]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,The Sleep Library: 11 Soothing Books For Bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,The Benefits of Caring for a Pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,This Is Only the Beginning: Surprising Advice ...,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,"Sundance, Ice-T, and Shades of the American Ra...","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


## Limpieza

In [11]:
df["t_text"]=df["t_text"].astype(str)

In [12]:
df["t_text"]=df["t_text"].map(lambda x:clean_text(x, pattern="[^a-zA-Z ]"))

In [13]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,will smith joins diplo and nicky jam for the w...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries for the first time at age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts castrato adam schiff and dem...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated that sexual harassme...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,the sleep library soothing books for bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,the benefits of caring for a pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,this is only the beginning surprising advice f...,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice t and shades of the american race...,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


## Stop words

In [15]:
#pip install nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/carla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

- Eliminar todas las palabras que no brindan información adicional. En el ejemplo, la palabra más importante es "canción" porque puede orientar cualquier modelo de clasificación en la dirección correcta. Por el contrario, palabras como "y", "para", "el" no son útiles, ya que probablemente aparecen en casi todas las observaciones del conjunto de datos.

In [16]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,will smith joins diplo and nicky jam for the w...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries for the first time at age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts castrato adam schiff and dem...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated that sexual harassme...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,the sleep library soothing books for bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,the benefits of caring for a pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,this is only the beginning surprising advice f...,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice t and shades of the american race...,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


In [17]:
stop_words=nltk.corpus.stopwords.words("english")

In [19]:
len(stop_words)

179

In [18]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
text=df.iloc[0]["t_text"]

In [22]:
text.split(" ")

['will',
 'smith',
 'joins',
 'diplo',
 'and',
 'nicky',
 'jam',
 'for',
 'the',
 'world',
 'cup',
 's',
 'official',
 'song']

In [24]:
[x for x in text.split(" ") if x not in stop_words]

['smith', 'joins', 'diplo', 'nicky', 'jam', 'world', 'cup', 'official', 'song']

In [26]:
" ".join([x for x in text.split(" ") if x not in stop_words])

'smith joins diplo nicky jam world cup official song'

In [27]:
df.iloc[0]["t_text"]

'will smith joins diplo and nicky jam for the world cup s official song'

In [27]:
df.iloc[0]["t_text"]

'will smith joins diplo and nicky jam for the world cup s official song'

In [28]:
df["t_text"]=df["t_text"].map(lambda text:" ".join([x for x in text.split(" ") if x not in stop_words]))

In [29]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,smith joins diplo nicky jam world cup official...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries first time age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,sleep library soothing books bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,benefits caring pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,beginning surprising advice centenarian,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice shades american race cinema,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


## HAPAXES

- Palabras que aparecen solo una vez

In [31]:
corpus=" ".join(df["t_text"].values)

In [32]:
corpus



In [33]:
corpus.split()

['smith',
 'joins',
 'diplo',
 'nicky',
 'jam',
 'world',
 'cup',
 'official',
 'song',
 'hugh',
 'grant',
 'marries',
 'first',
 'time',
 'age',
 'jim',
 'carrey',
 'blasts',
 'castrato',
 'adam',
 'schiff',
 'democrats',
 'new',
 'artwork',
 'julianna',
 'margulies',
 'uses',
 'donald',
 'trump',
 'poop',
 'bags',
 'pick',
 'dog',
 'morgan',
 'freeman',
 'devastated',
 'sexual',
 'harassment',
 'claims',
 'could',
 'undermine',
 'legacy',
 'donald',
 'trump',
 'lovin',
 'new',
 'mcdonald',
 'jingle',
 'tonight',
 'show',
 'bit',
 'watch',
 'amazon',
 'prime',
 'thats',
 'new',
 'week',
 'mike',
 'myers',
 'reveals',
 'like',
 'fourth',
 'austin',
 'powers',
 'film',
 'watch',
 'hulu',
 'thats',
 'new',
 'week',
 'justin',
 'timberlake',
 'visits',
 'texas',
 'school',
 'shooting',
 'victims',
 'trump',
 'crackdown',
 'immigrant',
 'parents',
 'puts',
 'kids',
 'already',
 'strained',
 'system',
 'trump',
 'son',
 'concerned',
 'fbi',
 'obtained',
 'wiretaps',
 'putin',
 'ally',
 'met

In [34]:
fdist=nltk.FreqDist(corpus.split())

In [35]:
fdist

FreqDist({'trump': 9329, 'donald': 3357, 'new': 2915, 'says': 1904, 'clinton': 1788, 'gop': 1725, 'obama': 1690, 'health': 1440, 'hillary': 1347, 'house': 1295, ...})

In [39]:
pd.DataFrame(fdist.items()).sort_values(by=[1])

Unnamed: 0,0,1
29835,unsuitable,1
21131,fiorinas,1
9808,scalpel,1
21130,porsha,1
9810,dividend,1
...,...,...
263,clinton,1788
205,says,1904
22,new,2915
27,donald,3357


In [40]:
list_hapaxes=fdist.hapaxes()

In [None]:
len

In [42]:
len(list_hapaxes)

11790

In [41]:
list_hapaxes

['castrato',
 'lovin',
 'wiretaps',
 'putstarwarsinotherfilms',
 'menaced',
 'puncture',
 'cheek',
 'prostitute',
 'spygate',
 'moses',
 'revising',
 'brynn',
 'cartelli',
 'lourdes',
 'nobu',
 'mcgrath',
 'hunchback',
 'bothers',
 'mcbath',
 'poppe',
 'pansexuality',
 'grievance',
 'commemorative',
 'giulianis',
 'sinkhole',
 'aeroflot',
 'inconvenient',
 'storage',
 'conjured',
 'queue',
 'jarring',
 'unwitting',
 'godmothers',
 'archetypes',
 'myanmar',
 'kathryn',
 'kelton',
 'shotgun',
 'argento',
 'ritalin',
 'tish',
 'cedric',
 'emissary',
 'smallville',
 'yanny',
 'laurel',
 'markles',
 'ranted',
 'disciplinary',
 'happytime',
 'bios',
 'wilkie',
 'treasuries',
 'pansexual',
 'nev',
 'schulman',
 'warwick',
 'mta',
 'input',
 'financials',
 'piscotty',
 'bereavement',
 'wylie',
 'greased',
 'moonwalked',
 'hartogensis',
 'infuriated',
 'raptor',
 'dripping',
 'researched',
 'scoundrel',
 'preteen',
 'nuttiest',
 'avenatti',
 'pounce',
 'seann',
 'mvp',
 'stepmom',
 'reformist',

In [43]:
len(corpus.split(" "))

451345

In [44]:
len(list_hapaxes)

11790

In [45]:
df.shape

(66624, 6)

In [46]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,smith joins diplo nicky jam world cup official...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries first time age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,sleep library soothing books bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,benefits caring pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,beginning surprising advice centenarian,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice shades american race cinema,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


In [47]:
df["t_text"]=df["t_text"].map(lambda text:" ".join([x for x in text.split(" ") if x not in list_hapaxes]))

In [48]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,smith joins diplo nicky jam world cup official...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries first time age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts adam schiff democrats new ar...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,sleep library soothing books bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,benefits caring pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,beginning surprising advice centenarian,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice shades american race cinema,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


## TOKENIZACION

- La tokenización es el proceso de dividir una cadena en una lista de cadenas (o "tokens").
- Ahora que tenemos todos los tokens útiles

In [49]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,smith joins diplo nicky jam world cup official...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,hugh grant marries first time age,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,jim carrey blasts adam schiff democrats new ar...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,sleep library soothing books bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,benefits caring pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,beginning surprising advice centenarian,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,sundance ice shades american race cinema,"Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


In [50]:
df["t_text"]=df["t_text"].map(lambda x:x.split())

In [51]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,"[smith, joins, diplo, nicky, jam, world, cup, ...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,"[jim, carrey, blasts, adam, schiff, democrats,...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,"[julianna, margulies, uses, donald, trump, poo...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...
...,...,...,...,...,...,...
66619,WELLNESS,"[sleep, library, soothing, books, bedtime]",,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime
66620,WELLNESS,"[benefits, caring, pet]","Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet
66621,WELLNESS,"[beginning, surprising, advice, centenarian]","Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...
66622,ENTERTAINMENT,"[sundance, ice, shades, american, race, cinema]","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra..."


## Stemming and Lemmatization

- Tanto la derivación como la lematización generan la forma raíz de las palabras. La diferencia es que la raíz puede no ser una palabra real, mientras que lema es una palabra del idioma real (también la raíz suele ser más rápida). Ambos algoritmos son proporcionados por NLTK

- Tenga en cuenta que no debe aplicar tanto la derivación como la lematización.

In [52]:
text=df.iloc[0]["t_text"]

In [53]:
text

['smith', 'joins', 'diplo', 'nicky', 'jam', 'world', 'cup', 'official', 'song']

In [54]:
text=" ".join([x for x in text if x not in list_hapaxes])

In [55]:
text

'smith joins diplo nicky jam world cup official song'

In [56]:
text=text.split(" ")

In [57]:
text

['smith', 'joins', 'diplo', 'nicky', 'jam', 'world', 'cup', 'official', 'song']

In [60]:
print("--- stemming ---")
ps = nltk.stem.porter.PorterStemmer()

--- stemming ---


In [61]:
ps.stem("joins")

'join'

In [59]:
print([ps.stem(word) for word in text])

['smith', 'join', 'diplo', 'nicki', 'jam', 'world', 'cup', 'offici', 'song']


In [64]:
text

['smith', 'joins', 'diplo', 'nicky', 'jam', 'world', 'cup', 'official', 'song']

In [68]:
#import nltk
#nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/carla/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [69]:
print("--- lemmatisation ---")
lem = nltk.stem.wordnet.WordNetLemmatizer()
print([lem.lemmatize(word) for word in text])

--- lemmatisation ---
['smith', 'join', 'diplo', 'nicky', 'jam', 'world', 'cup', 'official', 'song']


In [70]:
ps=nltk.stem.porter.PorterStemmer()

In [71]:
df["t_text"]=df["t_text"].map(lambda text:[lem.lemmatize(word) for word in text])

In [72]:
df.head()

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...


## Creación de caracteristicas

### LONGITUD

- Análisis de longitud
- Es importante echar un vistazo a la longitud del texto porque es un cálculo sencillo que puede brindar una gran cantidad de información valiosa. Quizás, por ejemplo, tengamos la suerte de descubrir que una categoría es sistemáticamente más larga que otra y la longitud sería simplemente la única característica necesaria para construir el modelo. 
- Hay varias medidas de longitud para los datos de texto. Eejemplos:

In [73]:
#NUMERO DE PALABRAS
df['c_numero_palabras'] = df["t_text"].apply(lambda x: len(x))

In [74]:
#NUMERO DE LETRAS POR SENTENCIA
df['c_numero_letras'] =df["t_text"].map(lambda x: sum(len(word) for word in x))

In [75]:
#LONGITUD PROMEDIO DE PALABRAS
df['c_long_pro_pala'] = df['c_numero_letras'] / df['c_numero_palabras']

In [76]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.250000
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111
...,...,...,...,...,...,...,...,...,...
66619,WELLNESS,"[sleep, library, soothing, book, bedtime]",,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime,5,31,6.200000
66620,WELLNESS,"[benefit, caring, pet]","Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet,3,16,5.333333
66621,WELLNESS,"[beginning, surprising, advice, centenarian]","Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...,4,36,9.000000
66622,ENTERTAINMENT,"[sundance, ice, shade, american, race, cinema]","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra...",6,34,5.666667


### ANALIS DE SENTIMIENTOS

- Análisis de los sentimientos
- El análisis de sentimientos es la representación de emociones subjetivas de datos de texto a través de números o clases.
- El mejor enfoque sería entrenar su propio modelo de sentimiento que se ajuste correctamente a sus datos. Cuando no hay suficiente tiempo o datos para eso, se pueden usar modelos previamente entrenados, como Textblob y Vader. Textblob, construido sobre NLTK, es uno de los más populares,
- Valores entre -1 y 1 , donde -1 e negativo y 1 positivo

In [77]:
#pip install textblob
from textblob import TextBlob

In [79]:
text=df.iloc[0]["t_text"]

In [80]:
text=" ".join(text)

In [81]:
text

'smith join diplo nicky jam world cup official song'

In [82]:
TextBlob(text)

TextBlob("smith join diplo nicky jam world cup official song")

In [84]:
TextBlob(text).sentiment.polarity

0.0

In [85]:
df.head(11)

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.25
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111
5,ENTERTAINMENT,"[donald, trump, new, mcdonald, jingle, tonight...",Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,8,42,5.25
6,ENTERTAINMENT,"[watch, amazon, prime, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,2018-05-26,What To Watch On Amazon Prime That’s New This ...,6,28,4.666667
7,ENTERTAINMENT,"[mike, myers, reveals, like, fourth, austin, p...",Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,8,41,5.125
8,ENTERTAINMENT,"[watch, hulu, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,2018-05-26,What To Watch On Hulu That’s New This Week,5,21,4.2
9,ENTERTAINMENT,"[justin, timberlake, visit, texas, school, sho...",Sebastian Murdock,https://www.huffingtonpost.com/entry/justin-ti...,2018-05-26,Justin Timberlake Visits Texas School Shooting...,7,46,6.571429


In [86]:
df["t_text"][:20].map(lambda x:TextBlob(" ".join(x)).sentiment.polarity)

0     0.000000
1     0.250000
2     0.136364
3     0.000000
4     0.500000
5     0.136364
6     0.136364
7     0.000000
8     0.136364
9    -0.075000
10    0.000000
11    0.000000
12    0.500000
13    0.500000
14    0.000000
15    0.250000
16    0.375000
17    0.500000
18    0.000000
19    0.000000
Name: t_text, dtype: float64

In [88]:
df["t_text_ori"][1]

'Hugh Grant Marries For The First Time At Age 57'

In [89]:
df["t_text_ori"][16]

"Trump's Scottish Golf Resort Pays Women Significantly Less Than Men: Report"

In [90]:
df["t_text_ori"][2]

"Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork"

In [91]:
df["t_text_ori"][9]

'Justin Timberlake Visits Texas School Shooting Victims'

In [79]:
#https://pypi.org/project/sentiment-analysis-spanish/

In [92]:
df

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.250000
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111
...,...,...,...,...,...,...,...,...,...
66619,WELLNESS,"[sleep, library, soothing, book, bedtime]",,https://www.huffingtonpost.comhttp://www.oprah...,2012-01-28,The Sleep Library: 11 Soothing Books For Bedtime,5,31,6.200000
66620,WELLNESS,"[benefit, caring, pet]","Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,2012-01-28,The Benefits of Caring for a Pet,3,16,5.333333
66621,WELLNESS,"[beginning, surprising, advice, centenarian]","Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,2012-01-28,This Is Only the Beginning: Surprising Advice ...,4,36,9.000000
66622,ENTERTAINMENT,"[sundance, ice, shade, american, race, cinema]","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,2012-01-28,"Sundance, Ice-T, and Shades of the American Ra...",6,34,5.666667


### Reconocimiento de entidad designada

- NER (reconocimiento de entidad con nombre) es el proceso para etiquetar entidades con nombre mencionadas en texto no estructurado con categorías predefinidas como nombres de personas, organizaciones, ubicaciones, expresiones de tiempo, cantidades, etc.
https://spacy.io/usage/spacy-101

In [94]:
#pip install spacy
import spacy
from collections import Counter

In [96]:
#python3 -m spacy download en_core_web_sm
ner = spacy.load('en_core_web_sm')

txt = df["t_text_ori"].iloc[0]


In [98]:
text

'smith join diplo nicky jam world cup official song'

In [99]:
txt

"Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song"

In [100]:
doc = ner(text)
spacy.displacy.render(doc, style="ent")

In [102]:
doc = ner(txt)

spacy.displacy.render(doc, style="ent")

In [103]:
for token in doc.ents:
    print(token)
    print(token.label_)

Will Smith
PERSON
Diplo
PERSON
Nicky Jam
PERSON
The 2018 World Cup's
EVENT


In [104]:
aux=df[:10]

In [105]:
aux["docs_ner"]=aux["t_text_ori"].map(lambda x:ner(x))

In [106]:
from collections import Counter

In [107]:
aux["ner"]=aux["docs_ner"].map(lambda x:[y for y in x.ents])

In [109]:
aux["ner"]

0    [(Will, Smith), (Diplo), (Nicky, Jam), (The, 2...
1                                 [(First), (Age, 57)]
2    [(Jim, Carrey, Blasts, '), (Democrats), (New, ...
3                [(Julianna, Margulies, Uses, Donald)]
4                               [(Morgan, Freeman, ')]
5    [(Donald, Trump), (Jingle, In, '), (Tonight, S...
6                                           [(Amazon)]
7                             [(Mike, Myers, Reveals)]
8                                                   []
9    [(Justin, Timberlake, Visits, Texas, School, S...
Name: ner, dtype: object

In [110]:
Counter([1,1,1,2])

Counter({1: 3, 2: 1})

In [111]:
aux["aux"]=aux["docs_ner"].map(lambda x:Counter([y.label_ for y in x.ents]))

In [113]:
aux

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala,docs_ner,ner,aux
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667,"(Will, Smith, Joins, Diplo, And, Nicky, Jam, F...","[(Will, Smith), (Diplo), (Nicky, Jam), (The, 2...","{'PERSON': 3, 'EVENT': 1}"
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667,"(Hugh, Grant, Marries, For, The, First, Time, ...","[(First), (Age, 57)]","{'ORDINAL': 1, 'DATE': 1}"
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.25,"(Jim, Carrey, Blasts, ', Castrato, ', Adam, Sc...","[(Jim, Carrey, Blasts, '), (Democrats), (New, ...","{'PERSON': 1, 'NORP': 1, 'GPE': 1}"
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889,"(Julianna, Margulies, Uses, Donald, Trump, Poo...","[(Julianna, Margulies, Uses, Donald)]",{'PERSON': 1}
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111,"(Morgan, Freeman, ', Devastated, ', That, Sexu...","[(Morgan, Freeman, ')]",{'ORG': 1}
5,ENTERTAINMENT,"[donald, trump, new, mcdonald, jingle, tonight...",Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,8,42,5.25,"(Donald, Trump, Is, Lovin', New, McDonald, 's,...","[(Donald, Trump), (Jingle, In, '), (Tonight, S...","{'PERSON': 1, 'ORG': 1, 'WORK_OF_ART': 1}"
6,ENTERTAINMENT,"[watch, amazon, prime, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,2018-05-26,What To Watch On Amazon Prime That’s New This ...,6,28,4.666667,"(What, To, Watch, On, Amazon, Prime, That, ’s,...",[(Amazon)],{'ORG': 1}
7,ENTERTAINMENT,"[mike, myers, reveals, like, fourth, austin, p...",Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,8,41,5.125,"(Mike, Myers, Reveals, He, 'd, ', Like, To, ',...","[(Mike, Myers, Reveals)]",{'PERSON': 1}
8,ENTERTAINMENT,"[watch, hulu, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,2018-05-26,What To Watch On Hulu That’s New This Week,5,21,4.2,"(What, To, Watch, On, Hulu, That, ’s, New, Thi...",[],{}
9,ENTERTAINMENT,"[justin, timberlake, visit, texas, school, sho...",Sebastian Murdock,https://www.huffingtonpost.com/entry/justin-ti...,2018-05-26,Justin Timberlake Visits Texas School Shooting...,7,46,6.571429,"(Justin, Timberlake, Visits, Texas, School, Sh...","[(Justin, Timberlake, Visits, Texas, School, S...",{'ORG': 1}


In [112]:
aux["aux"]

0                    {'PERSON': 3, 'EVENT': 1}
1                    {'ORDINAL': 1, 'DATE': 1}
2           {'PERSON': 1, 'NORP': 1, 'GPE': 1}
3                                {'PERSON': 1}
4                                   {'ORG': 1}
5    {'PERSON': 1, 'ORG': 1, 'WORK_OF_ART': 1}
6                                   {'ORG': 1}
7                                {'PERSON': 1}
8                                           {}
9                                   {'ORG': 1}
Name: aux, dtype: object

In [114]:
aux.iloc[2]["ner"]

[Jim Carrey Blasts ', Democrats, New Artwork]

In [115]:
aux.iloc[2]["t_text_ori"]

"Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork"

In [116]:
aux.iloc[2]["aux"]

Counter({'PERSON': 1, 'NORP': 1, 'GPE': 1})

In [118]:
aux

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala,docs_ner,ner,aux
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667,"(Will, Smith, Joins, Diplo, And, Nicky, Jam, F...","[(Will, Smith), (Diplo), (Nicky, Jam), (The, 2...","{'PERSON': 3, 'EVENT': 1}"
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667,"(Hugh, Grant, Marries, For, The, First, Time, ...","[(First), (Age, 57)]","{'ORDINAL': 1, 'DATE': 1}"
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.25,"(Jim, Carrey, Blasts, ', Castrato, ', Adam, Sc...","[(Jim, Carrey, Blasts, '), (Democrats), (New, ...","{'PERSON': 1, 'NORP': 1, 'GPE': 1}"
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889,"(Julianna, Margulies, Uses, Donald, Trump, Poo...","[(Julianna, Margulies, Uses, Donald)]",{'PERSON': 1}
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111,"(Morgan, Freeman, ', Devastated, ', That, Sexu...","[(Morgan, Freeman, ')]",{'ORG': 1}
5,ENTERTAINMENT,"[donald, trump, new, mcdonald, jingle, tonight...",Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,8,42,5.25,"(Donald, Trump, Is, Lovin', New, McDonald, 's,...","[(Donald, Trump), (Jingle, In, '), (Tonight, S...","{'PERSON': 1, 'ORG': 1, 'WORK_OF_ART': 1}"
6,ENTERTAINMENT,"[watch, amazon, prime, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,2018-05-26,What To Watch On Amazon Prime That’s New This ...,6,28,4.666667,"(What, To, Watch, On, Amazon, Prime, That, ’s,...",[(Amazon)],{'ORG': 1}
7,ENTERTAINMENT,"[mike, myers, reveals, like, fourth, austin, p...",Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,8,41,5.125,"(Mike, Myers, Reveals, He, 'd, ', Like, To, ',...","[(Mike, Myers, Reveals)]",{'PERSON': 1}
8,ENTERTAINMENT,"[watch, hulu, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,2018-05-26,What To Watch On Hulu That’s New This Week,5,21,4.2,"(What, To, Watch, On, Hulu, That, ’s, New, Thi...",[],{}
9,ENTERTAINMENT,"[justin, timberlake, visit, texas, school, sho...",Sebastian Murdock,https://www.huffingtonpost.com/entry/justin-ti...,2018-05-26,Justin Timberlake Visits Texas School Shooting...,7,46,6.571429,"(Justin, Timberlake, Visits, Texas, School, Sh...","[(Justin, Timberlake, Visits, Texas, School, S...",{'ORG': 1}


In [119]:
aux["aux"].apply(pd.Series)

Unnamed: 0,PERSON,EVENT,ORDINAL,DATE,NORP,GPE,ORG,WORK_OF_ART
0,3.0,1.0,,,,,,
1,,,1.0,1.0,,,,
2,1.0,,,,1.0,1.0,,
3,1.0,,,,,,,
4,,,,,,,1.0,
5,1.0,,,,,,1.0,1.0
6,,,,,,,1.0,
7,1.0,,,,,,,
8,,,,,,,,
9,,,,,,,1.0,


In [120]:
aux=pd.concat([aux,aux["aux"].apply(pd.Series).fillna(0)],axis=1)

In [121]:
aux

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala,docs_ner,ner,aux,PERSON,EVENT,ORDINAL,DATE,NORP,GPE,ORG,WORK_OF_ART
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667,"(Will, Smith, Joins, Diplo, And, Nicky, Jam, F...","[(Will, Smith), (Diplo), (Nicky, Jam), (The, 2...","{'PERSON': 3, 'EVENT': 1}",3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667,"(Hugh, Grant, Marries, For, The, First, Time, ...","[(First), (Age, 57)]","{'ORDINAL': 1, 'DATE': 1}",0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.25,"(Jim, Carrey, Blasts, ', Castrato, ', Adam, Sc...","[(Jim, Carrey, Blasts, '), (Democrats), (New, ...","{'PERSON': 1, 'NORP': 1, 'GPE': 1}",1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889,"(Julianna, Margulies, Uses, Donald, Trump, Poo...","[(Julianna, Margulies, Uses, Donald)]",{'PERSON': 1},1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111,"(Morgan, Freeman, ', Devastated, ', That, Sexu...","[(Morgan, Freeman, ')]",{'ORG': 1},0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,ENTERTAINMENT,"[donald, trump, new, mcdonald, jingle, tonight...",Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,8,42,5.25,"(Donald, Trump, Is, Lovin', New, McDonald, 's,...","[(Donald, Trump), (Jingle, In, '), (Tonight, S...","{'PERSON': 1, 'ORG': 1, 'WORK_OF_ART': 1}",1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
6,ENTERTAINMENT,"[watch, amazon, prime, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,2018-05-26,What To Watch On Amazon Prime That’s New This ...,6,28,4.666667,"(What, To, Watch, On, Amazon, Prime, That, ’s,...",[(Amazon)],{'ORG': 1},0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,ENTERTAINMENT,"[mike, myers, reveals, like, fourth, austin, p...",Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,8,41,5.125,"(Mike, Myers, Reveals, He, 'd, ', Like, To, ',...","[(Mike, Myers, Reveals)]",{'PERSON': 1},1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,ENTERTAINMENT,"[watch, hulu, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,2018-05-26,What To Watch On Hulu That’s New This Week,5,21,4.2,"(What, To, Watch, On, Hulu, That, ’s, New, Thi...",[],{},0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ENTERTAINMENT,"[justin, timberlake, visit, texas, school, sho...",Sebastian Murdock,https://www.huffingtonpost.com/entry/justin-ti...,2018-05-26,Justin Timberlake Visits Texas School Shooting...,7,46,6.571429,"(Justin, Timberlake, Visits, Texas, School, Sh...","[(Justin, Timberlake, Visits, Texas, School, S...",{'ORG': 1},0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


NORP - Nacionalidades o grupos religiosos o políticos
PERSON, NORP (nationalities, religious and political groups), FAC (buildings, airports etc.), ORG (organizations), GPE (countries, cities etc.), LOC (mountain ranges, water bodies etc.), PRODUCT (products), EVENT (event names), WORK_OF_ART (books, song titles), LAW (legal document titles), LANGUAGE (named languages), DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL and CARDINAL.

In [123]:

aux

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala,docs_ner,aux,ner,PERSON,DATE,EVENT,NORP,GPE,ORG,WORK_OF_ART
0,ENTERTAINMENT,"[smith, join, diplo, nicky, jam, world, cup, o...",Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667,"(Will, Smith, Joins, Diplo, And, Nicky, Jam, F...","{'PERSON': 2, 'DATE': 1, 'EVENT': 1}","[(Smith, Joins, Diplo), (Nicky, Jam), (2018), ...",2.0,1.0,1.0,0.0,0.0,0.0,0.0
1,ENTERTAINMENT,"[hugh, grant, marries, first, time, age]",Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,2018-05-26,Hugh Grant Marries For The First Time At Age 57,6,28,4.666667,"(Hugh, Grant, Marries, For, The, First, Time, ...",{'DATE': 1},"[(Age, 57)]",0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,ENTERTAINMENT,"[jim, carrey, blast, adam, schiff, democrat, n...",Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,8,42,5.25,"(Jim, Carrey, Blasts, ', Castrato, ', Adam, Sc...","{'PERSON': 1, 'NORP': 1, 'GPE': 1}","[(Jim, Carrey), (Democrats), (New, Artwork)]",1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,ENTERTAINMENT,"[julianna, margulies, us, donald, trump, poop,...",Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,9,44,4.888889,"(Julianna, Margulies, Uses, Donald, Trump, Poo...",{'PERSON': 1},"[(Donald, Trump)]",1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENTERTAINMENT,"[morgan, freeman, devastated, sexual, harassme...",Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,9,64,7.111111,"(Morgan, Freeman, ', Devastated, ', That, Sexu...",{'ORG': 1},"[(Morgan, Freeman, ', Devastated, ', That, Sex...",0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,ENTERTAINMENT,"[donald, trump, new, mcdonald, jingle, tonight...",Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,8,42,5.25,"(Donald, Trump, Is, Lovin', New, McDonald, 's,...","{'PERSON': 1, 'WORK_OF_ART': 1}","[(Donald, Trump), (Tonight, Show, ', Bit)]",1.0,0.0,0.0,0.0,0.0,0.0,1.0
6,ENTERTAINMENT,"[watch, amazon, prime, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,2018-05-26,What To Watch On Amazon Prime That’s New This ...,6,28,4.666667,"(What, To, Watch, On, Amazon, Prime, That, ’s,...",{},[],0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,ENTERTAINMENT,"[mike, myers, reveals, like, fourth, austin, p...",Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,8,41,5.125,"(Mike, Myers, Reveals, He, 'd, ', Like, To, ',...","{'PERSON': 1, 'WORK_OF_ART': 1}","[(Mike, Myers), (A, Fourth, Austin, Powers, Fi...",1.0,0.0,0.0,0.0,0.0,0.0,1.0
8,ENTERTAINMENT,"[watch, hulu, thats, new, week]",Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,2018-05-26,What To Watch On Hulu That’s New This Week,5,21,4.2,"(What, To, Watch, On, Hulu, That, ’s, New, Thi...",{},[],0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ENTERTAINMENT,"[justin, timberlake, visit, texas, school, sho...",Sebastian Murdock,https://www.huffingtonpost.com/entry/justin-ti...,2018-05-26,Justin Timberlake Visits Texas School Shooting...,7,46,6.571429,"(Justin, Timberlake, Visits, Texas, School, Sh...",{'ORG': 1},"[(Justin, Timberlake, Visits, Texas, School, S...",0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [118]:
a=aux[aux["NORP"].map(lambda x:x!=0)]["docs_ner"].values[0]

KeyError: 'NORP'

In [119]:
for token in a.ents:
    print(token)
    print(token.label_)

NameError: name 'a' is not defined

### COUNT VECTORIZER

- Es el método más utilizado para convertir datos de texto en sus representaciones vectoriales. Es similar a las variables "dummy", en el sentido de que CountVectorizer convierte columnas de texto en matrices donde las columnas son tokens y los valores de celda son recuentos de apariciones de cada token en cada documento. La matriz resultante se conoce como matriz documento-término porque cada fila representará un documento.

- Para utilizar count Vectorizer se debe tener en cuenta que antes se debe generar el conjunto de entranamiento y prueba ya que las columnas creadas se basan en las palabras encontradas en el texto proporcionado

**PARAMETROS RELEVANTES**
- stop_words : CountVectorizer(stop_words="english")#se le indica el idioma a considerar y eliminar la stop words
- min_df : solo incluye palabras que aparecen enval menos el 5% de los documentos del corpus
- max_df : solo incluye palabras que aparecen como máximo en el 80% de los documentos
- ngram_range : - este parámetro toma una tupla donde el límite inferior y superior del rango de n-valores indica el número de n-gramos diferentes que se extraerán. Los N-gramas representan frases, por lo que un valor de uno representaría una palabra, sin embargo, un valor de dos representaría dos palabras juntas. Como puede imaginar, esto ampliará significativamente nuestro conjunto.
- analyzer : Realiza conteos por caracter o por palabra ("char","word")

In [122]:
from sklearn.feature_extraction.text import CountVectorizer

In [123]:
#conjunto de entrenamiento y prueba
train = ['The sky is blue.','The sun is bright.']
test = ['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']

In [141]:
countvectorizer = CountVectorizer(analyzer= 'word',stop_words="english")

In [142]:
train

['The sky is blue.', 'The sun is bright.']

In [143]:
test

['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']

In [144]:
#El objeto aprende de las palabras dentro del conjunto de entrenamiento
countvectorizer.fit(train)

CountVectorizer(stop_words='english')

In [145]:
#Una vez entrenado podemos usarlo para transformar nueva información
count_train=countvectorizer.transform(train)
count_test=countvectorizer.transform(test)

In [146]:
#fit_transform
#fit
#transform

In [147]:
train

['The sky is blue.', 'The sun is bright.']

In [148]:
#Obtenemos el nombre de las columnas
columns=countvectorizer.get_feature_names()
columns

['blue', 'bright', 'sky', 'sun']

In [149]:
#Generamos un dataframe con el train
pd.DataFrame(data=count_train.toarray(),columns=columns)

Unnamed: 0,blue,bright,sky,sun
0,1,0,1,0
1,0,1,0,1


In [150]:
#Generamos un dataframe con el test
pd.DataFrame(data=count_test.toarray(),columns=columns)

Unnamed: 0,blue,bright,sky,sun
0,0,1,1,1
1,0,1,0,2


In [151]:
test

['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']

In [152]:
df["t_text"]=df["t_text"].map(lambda x:" ".join(x))

In [153]:
from sklearn.model_selection import train_test_split

In [164]:
df[["t_text"]][:100]

Unnamed: 0,t_text
0,smith join diplo nicky jam world cup official ...
1,hugh grant marries first time age
2,jim carrey blast adam schiff democrat new artwork
3,julianna margulies us donald trump poop bag pi...
4,morgan freeman devastated sexual harassment cl...
...,...
95,ariana grande shuts fan blaming heartbreaking ...
96,parent slam show dog scene grooming child sexu...
97,dave grohl want apologize world massive jerk t...
98,emilia clarke introduced prince william pretty...


In [154]:
#EJEMPLO CON 100 REGISTROS
X_train,X_test=train_test_split(df[:100],test_size=0.20,random_state=42)

In [155]:
X_test

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala
83,ENTERTAINMENT,becomes youngest ever winner voice,Ron Dicker,https://www.huffingtonpost.com/entry/brynn-car...,2018-05-23,Brynn Cartelli Becomes Youngest-Ever Winner Of...,5,30,6.0
53,POLITICS,trump lawyer attended doj meeting confidential...,Igor Bobic and Ryan J. Reilly,https://www.huffingtonpost.com/entry/doj-meeti...,2018-05-24,Trump Lawyer Attended DOJ Meeting On Confident...,8,53,6.625
70,POLITICS,twitter critic mercilessly mock trump cancelin...,David Moye,https://www.huffingtonpost.com/entry/twitter-m...,2018-05-24,Twitter Critics Mercilessly Mock Trump For Can...,8,51,6.375
45,ENTERTAINMENT,infinity war writer try clear marvel problemat...,Andy McDonald,https://www.huffingtonpost.com/entry/infinity-...,2018-05-24,'Infinity War' Writers Try To Clear Up Marvel'...,8,50,6.25
44,ENTERTAINMENT,andy cohen taylor swift katy perry drama,Carly Ledbetter,https://www.huffingtonpost.com/entry/andy-cohe...,2018-05-24,Andy Cohen And Taylor Swift Are Over Their Kat...,7,34,4.857143
39,POLITICS,member far right proud boy twitter user doorstep,Andy Campbell,https://www.huffingtonpost.com/entry/far-right...,2018-05-25,A Member Of The Far-Right Proud Boys Menaced A...,8,41,5.125
22,ENTERTAINMENT,people rattled much nigerian man look like cha...,Jenna Amatulli,https://www.huffingtonpost.com/entry/nigerian-...,2018-05-25,People Are Rattled By How Much This Nigerian M...,9,51,5.666667
80,POLITICS,judge ruled one immigrant right hearing others,Elise Foley,https://www.huffingtonpost.com/entry/immigrant...,2018-05-24,A Judge Ruled One Immigrant Had A Right To A H...,7,40,5.714286
10,POLITICS,trump crackdown immigrant parent put kid alrea...,Elise Foley and Roque Planas,https://www.huffingtonpost.com/entry/immigrant...,2018-05-26,Trump's Crackdown On Immigrant Parents Puts Mo...,9,56,6.222222
0,ENTERTAINMENT,smith join diplo nicky jam world cup official ...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,9,42,4.666667


In [169]:
df.shape

(66624, 9)

In [156]:
vect = CountVectorizer(stop_words="english",ngram_range=(3,5))
vect.fit(X_train["t_text"])
array_train=vect.transform(X_train["t_text"])
array_test=vect.transform(X_test["t_text"])

In [157]:
aux_train=pd.DataFrame(array_train.toarray(),columns=vect.get_feature_names())
aux_test=pd.DataFrame(array_test.toarray(),columns=vect.get_feature_names())

In [158]:
aux_train

Unnamed: 0,abortion amendment landslide,abortion amendment landslide referendum,abortion giving men,abortion giving men property,abortion giving men property right,abortion haunted trump,abortion haunted trump brexit,abrams win democratic,abrams win democratic primary,abrams win democratic primary georgia,...,woody allen accuses mia,woody allen accuses mia farrow,world massive jerk,world massive jerk trump,wrecking ball kid,wrecking ball kid living,wrecking ball kid living room,zinke look reel,zinke look reel critic,zinke look reel critic grand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### CORRECTOR DE TEXTO
- https://unipython.com/hunspell-corrector-ortografico-en-python/
- https://medium.com/@hritikattri10/feature-extraction-using-tf-idf-algorithm-44eedb37305e

### TF - IDF VECTORIZER

- Un Tf-idfVectorizer se puede dividir en dos componentes. 
- Primero, la parte tf, que representa la frecuencia del término, y la parte idf, que significa frecuencia inversa del documento. Es un método de ponderación de términos que tiene aplicaciones en la recuperación y agrupación de información.
- Se da un peso para evaluar qué tan importante es una palabra para un documento en un corpus. Veamos cada parte un poco más:
- tf: frecuencia del término: mide la frecuencia con la que aparece un término en un documento. Dado que los documentos pueden tener una longitud diferente, es posible que un término aparezca muchas más veces en documentos más largos que en documentos más cortos. Por lo tanto, la frecuencia de los términos a menudo se divide por la longitud del documento, o el número total de términos en el documento, como una forma de normalización.
- idf: frecuencia inversa del documento: mide la importancia de un término. Al calcular la frecuencia de los términos, todos los términos se consideran igualmente importantes. Sin embargo, ciertos términos, como "la", "de" y "y", pueden aparecer muchas veces pero tienen poca importancia. Por lo tanto, debemos ponderar menos los términos frecuentes, mientras que ampliamos los raros.
- Para volver a enfatizar, un TfidfVectorizer es lo mismo que CountVectorizer, en el sentido de que construye características a partir de tokens, pero va un paso más allá y normaliza los conteos a la frecuencia de ocurrencias en un corpus.

https://www.ceadesc.org/article/how-to-process-textual-data-using-tf-idf-in-python-26fec0/

In [159]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [160]:
vect = TfidfVectorizer()
vect.fit(X_train["t_text"])
array_train= vect.transform(X_train["t_text"])
array_test= vect.transform(X_test["t_text"])

In [161]:
aux_train=pd.DataFrame(array_train.toarray(),columns=vect.get_feature_names())
aux_test=pd.DataFrame(array_train.toarray(),columns=vect.get_feature_names())

In [162]:
aux_train

Unnamed: 0,abortion,abrams,abroad,abruptly,abuse,accuse,accuser,accuses,act,ad,...,worker,world,worthy,would,wrecking,yacht,year,yet,zinke,zone
0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.317386,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.36348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.256282,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,0.337090,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
X_train

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala
55,POLITICS,rudy giuliani trump interview mueller get report,S.V. Date,https://www.huffingtonpost.com/entry/trump-giu...,2018-05-24,Rudy Giuliani: Trump Won't Interview With Muel...,7,42,6.000000
88,POLITICS,report michael cohen business partner cooperat...,"Reuters Staff, Reuters",https://www.huffingtonpost.com/entry/report-mi...,2018-05-23,Report: Michael Cohen's Business Partner Coope...,7,54,7.714286
26,ENTERTAINMENT,george takei accuser walk back story drugging ...,Cole Delbyck,https://www.huffingtonpost.com/entry/george-ta...,2018-05-25,George Takei Accuser Walks Back Story Of Drugg...,9,52,5.777778
42,POLITICS,trump new executive order make easier fire fed...,Antonia Blumberg,https://www.huffingtonpost.com/entry/trump-sig...,2018-05-25,Trump's New Executive Orders Make It Easier To...,9,49,5.444444
69,POLITICS,scott pruitt twice introduced anti abortion bi...,Alexander C. Kaufman,https://www.huffingtonpost.com/entry/scott-pru...,2018-05-24,Scott Pruitt Twice Introduced Anti-Abortion Bi...,12,69,5.750000
...,...,...,...,...,...,...,...,...,...
60,ENTERTAINMENT,ariana grande reveals bee tattoo honor manches...,Elyse Wanshel,https://www.huffingtonpost.com/entry/ariana-gr...,2018-05-24,Ariana Grande Reveals Bee Tattoo In Honor Of T...,8,49,6.125000
71,POLITICS,civil right group rip senate bill sexual haras...,Jennifer Bendery,https://www.huffingtonpost.com/entry/civil-rig...,2018-05-24,Civil Rights Groups Rip Senate Bill Revising I...,9,50,5.555556
14,POLITICS,ireland vote repeal abortion amendment landsli...,Laura Bassett,https://www.huffingtonpost.com/entry/results-f...,2018-05-26,Ireland Votes To Repeal Abortion Amendment In ...,7,53,7.571429
92,ENTERTAINMENT,way netflix trick watching show movie,Todd Van Luling,https://www.huffingtonpost.com/entry/netflix-w...,2018-05-23,5 Ways Netflix Tricks You Into Watching More S...,6,32,5.333333


In [164]:
X_train=pd.concat([X_train.reset_index(drop=True),aux_train.reset_index(drop=True)],axis=1)
#Se añaden las nuevas columnas al dataset train base
X_test=pd.concat([X_test.reset_index(drop=True),aux_test.reset_index(drop=True)],axis=1)
#Se añaden las nuevas columnas al dataset test base

In [165]:
X_train

Unnamed: 0,tgt,t_text,t_authors,t_link,d_date,t_text_ori,c_numero_palabras,c_numero_letras,c_long_pro_pala,abortion,...,worker,world,worthy,would,wrecking,yacht,year,yet,zinke,zone
0,POLITICS,rudy giuliani trump interview mueller get report,S.V. Date,https://www.huffingtonpost.com/entry/trump-giu...,2018-05-24,Rudy Giuliani: Trump Won't Interview With Muel...,7,42,6.000000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,POLITICS,report michael cohen business partner cooperat...,"Reuters Staff, Reuters",https://www.huffingtonpost.com/entry/report-mi...,2018-05-23,Report: Michael Cohen's Business Partner Coope...,7,54,7.714286,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENTERTAINMENT,george takei accuser walk back story drugging ...,Cole Delbyck,https://www.huffingtonpost.com/entry/george-ta...,2018-05-25,George Takei Accuser Walks Back Story Of Drugg...,9,52,5.777778,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,POLITICS,trump new executive order make easier fire fed...,Antonia Blumberg,https://www.huffingtonpost.com/entry/trump-sig...,2018-05-25,Trump's New Executive Orders Make It Easier To...,9,49,5.444444,0.000000,...,0.36348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,POLITICS,scott pruitt twice introduced anti abortion bi...,Alexander C. Kaufman,https://www.huffingtonpost.com/entry/scott-pru...,2018-05-24,Scott Pruitt Twice Introduced Anti-Abortion Bi...,12,69,5.750000,0.256282,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,ENTERTAINMENT,ariana grande reveals bee tattoo honor manches...,Elyse Wanshel,https://www.huffingtonpost.com/entry/ariana-gr...,2018-05-24,Ariana Grande Reveals Bee Tattoo In Honor Of T...,8,49,6.125000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,POLITICS,civil right group rip senate bill sexual haras...,Jennifer Bendery,https://www.huffingtonpost.com/entry/civil-rig...,2018-05-24,Civil Rights Groups Rip Senate Bill Revising I...,9,50,5.555556,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,POLITICS,ireland vote repeal abortion amendment landsli...,Laura Bassett,https://www.huffingtonpost.com/entry/results-f...,2018-05-26,Ireland Votes To Repeal Abortion Amendment In ...,7,53,7.571429,0.337090,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,ENTERTAINMENT,way netflix trick watching show movie,Todd Van Luling,https://www.huffingtonpost.com/entry/netflix-w...,2018-05-23,5 Ways Netflix Tricks You Into Watching More S...,6,32,5.333333,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
X_train.describe()

Unnamed: 0,c_numero_palabras,c_numero_letras,c_long_pro_pala,abortion,abrams,abroad,abruptly,abuse,accuse,accuser,...,worker,world,worthy,would,wrecking,yacht,year,yet,zinke,zone
count,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,...,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0
mean,8.275,47.4375,5.782171,0.011544,0.005007,0.004689,0.004667,0.01117,0.004543,0.008357,...,0.004543,0.004635,0.004064,0.008758,0.004466,0.003986,0.004055,0.004781,0.004272,0.004596
std,1.574922,9.403533,0.872326,0.059284,0.044787,0.04194,0.041744,0.057045,0.040632,0.052584,...,0.040638,0.041461,0.036346,0.05504,0.039943,0.035651,0.036265,0.042765,0.03821,0.041109
min,3.0,21.0,4.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,43.0,5.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,48.5,5.651515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,54.0,6.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,69.0,10.0,0.33709,0.400587,0.375121,0.373374,0.310365,0.363426,0.351138,...,0.36348,0.370839,0.325085,0.351138,0.357258,0.31887,0.32436,0.382501,0.341757,0.367688
