# LIMPIEZA DE LA BASE DE DATOS

In [2]:
import pandas as pd
import json
from spellchecker import SpellChecker
import sys
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud
from nltk.corpus import stopwords
import re
from tqdm import tqdm
from collections import Counter
from nltk.tokenize import TweetTokenizer
import spacy
spacy.cli.download('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
en_stopwords = set(stopwords.words('english'))

In [4]:
# Cargar el diccionario de contracciones desde el archivo JSON
with open('DATOS/contractions.json', 'r') as file:
    contractions_dict = json.load(file)

# Expresión regular para encontrar contracciones
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

## Inicializar el corrector ortográfico
spell = SpellChecker()

In [5]:
df = pd.read_csv("DATOS/BBDD_combinada_Paula_Maria_Laura_Paula_Ines_Gabi_Marcos.csv", encoding='ISO-8859-1')

In [6]:
df_sinstopwords = df
df_constopwords = df

In [7]:
print("Existen {} documentos duplicados en la columna de 'Titulo'".format(df.duplicated(subset=['Titulo']).sum()))
print("Existen {} documentos duplicados en la columna de 'Ponente'".format(df.duplicated(subset=['Ponente']).sum()))
print("Existen {} documentos duplicados en la columna de 'Contenido'".format(df.duplicated(subset=['Contenido']).sum()))
print("Hay {} valores vacíos en la columna Titulo, {} valores vacíos en la columna Ponente y {} valores vacíos en la columna Contenido".format(
    df.isnull().sum()["Titulo"],
    df.isnull().sum()["Ponente"],
    df.isnull().sum()['Contenido']))

Existen 0 documentos duplicados en la columna de 'Titulo'
Existen 1 documentos duplicados en la columna de 'Ponente'
Existen 0 documentos duplicados en la columna de 'Contenido'
Hay 0 valores vacíos en la columna Titulo, 0 valores vacíos en la columna Ponente y 0 valores vacíos en la columna Contenido


In [8]:
#Eliminar espacios
def eliminar_espacios(text):
    return  " ".join(text.split())

#To lower
def texto_to_lower(text):
    return text.lower()

#Eliminar Stopwords
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def eliminar_stopwords(texto):
    doc = nlp(texto)

    tokens_filtrados = [token.text for token in doc if token.text.lower() not in en_stopwords and token.is_alpha]

    texto_procesado = ' '.join(tokens_filtrados)

    return texto_procesado

#Contracciones. 
def expand_contractions(text, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

#Corrección ortográfica. 
def correct_text(text):
    corrected_text = []
    for word in text.split():
        corrected_word = spell.correction(word)
        # Si corrected_word es None, usar la palabra original
        if corrected_word is None:
            corrected_word = word
        corrected_text.append(corrected_word)
    return ' '.join(corrected_text)

## LIMPIEZA QUITANDO STOPWORDS

In [10]:
tqdm.pandas()

df_sinstopwords['Contenido_normalizado'] = df_sinstopwords['Contenido'].progress_apply(lambda x: eliminar_stopwords(texto_to_lower(correct_text(expand_contractions(eliminar_espacios(x))))))


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [27:38<00:00, 23.69s/it]


In [11]:
df_sinstopwords

Unnamed: 0,Titulo,Ponente,Contenido,Contenido_normalizado
0,The power of the vulnerability,Brené Brown,"So, I'll start with this: a couple years ago, ...",start couple years ago event planner called go...
1,A new life for mobile phones,Rajeevan Amirtharajah,okay so today I'm gonna talk about something ...,okay today ism gon na talk something probably ...
2,Strange answer to the psychopath,Jon Ronson,"The story starts: I was at a friend's house, a...",story starts friend house shelf copy dam manua...
3,Media and children,Dimitri Christakis,i'm a pediatrician a researcher and a parent a...,pediatrician researcher parent became things o...
4,Narrative humility,Sayantani DasGupta,my name is Sian tani Dasgupta I'm a medical hu...,name sign tank dasgupta ism medical humanities...
...,...,...,...,...
65,A Growth Mindset for a Creative Mind,Bosse Larsson,i will talk about two things first of all abo...,talk two things first view creativity think se...
66,Exercise and the Brain,Wendy Suzuki,how exciting to be here. what i'm going to try...,exciting going try today add amazing lineup sp...
67,10 Seconds,Grant Oliphant,late one night in 1944 in german-occupied amst...,late one night german occupied amsterdam three...
68,Infinite potential of human voice,Conrad Ma,morning uh people often say that i got a stran...,morning uh people often say got strange abilit...


In [12]:
df_sinstopwords = df_sinstopwords.drop(columns=['Contenido'])
df_sinstopwords = df_sinstopwords.rename(columns={'Contenido_normalizado': 'Contenido'})

In [13]:
df_sinstopwords.to_csv('DATOS/BBDD_limpia_quitados_stopwords.csv', index=False)

## LIMPIEZA SIN QUITAR STOPWORDS

In [15]:
tqdm.pandas()

df_constopwords["Contenido_normalizado"] = df_constopwords["Contenido"].progress_apply(lambda x: texto_to_lower(expand_contractions(correct_text(expand_contractions(eliminar_espacios(x))))))

100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [29:24<00:00, 25.21s/it]


In [16]:
df_constopwords

Unnamed: 0,Titulo,Ponente,Contenido,Contenido_normalizado
0,The power of the vulnerability,Brené Brown,"So, I'll start with this: a couple years ago, ...",so it will start with this a couple years ago ...
1,A new life for mobile phones,Rajeevan Amirtharajah,okay so today I'm gonna talk about something ...,okay so today ism gonna talk about something t...
2,Strange answer to the psychopath,Jon Ronson,"The story starts: I was at a friend's house, a...",the story starts i was at a friend's house and...
3,Media and children,Dimitri Christakis,i'm a pediatrician a researcher and a parent a...,i am a pediatrician a researcher and a parent ...
4,Narrative humility,Sayantani DasGupta,my name is Sian tani Dasgupta I'm a medical hu...,my name is sign tank dasgupta ism a medical hu...
...,...,...,...,...
65,A Growth Mindset for a Creative Mind,Bosse Larsson,i will talk about two things first of all abo...,i will talk about two things first of all abou...
66,Exercise and the Brain,Wendy Suzuki,how exciting to be here. what i'm going to try...,how exciting to be here what i am going to try...
67,10 Seconds,Grant Oliphant,late one night in 1944 in german-occupied amst...,late one night in 1944 in german-occupied amst...
68,Infinite potential of human voice,Conrad Ma,morning uh people often say that i got a stran...,morning uh people often say that i got a stran...


In [17]:
df_constopwords = df.drop(columns=['Contenido'])
df_constopwords = df.rename(columns={'Contenido_normalizado': 'Contenido'})

In [18]:
df_constopwords.to_csv('DATOS/BBDD_limpia_sin_quitar_stopwords.csv', index=False)