## Análisis de la longitud de los textos: 

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw scrape. The default is the original author's local copy;
# set the HOTSCRAPE_CSV environment variable to point at the file on another
# machine instead of editing this cell.
RAW_CSV_PATH = os.environ.get(
    'HOTSCRAPE_CSV',
    r'C:\Users\10and\OneDrive\Documentos\GitHub\4geeks_finalproject_modeling\data\raw\all_hotscrape_v2(elbueno).csv',
)
text_ds = pd.read_csv(RAW_CSV_PATH)

text_ds.head(5)


Unnamed: 0,text,submission_type,subreddit,label
0,Media: Nobody knows what kamala is about\n\nMe...,comment,politics,kamala
1,NYT breaking news that Netanyahu has agreed to...,comment,politics,trumper
2,I love how the stock crash 2 weeks ago was HUG...,comment,politics,trump
3,I was thinking this morning about how freaking...,comment,politics,trumper
4,Conservative in a purple state. I'm voting for...,comment,politics,trump


Procesamiento del texto:

In [3]:
def preprocess_text(text):
    """Normalise a raw document and split it into lowercase word tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Drop markup tags such as ``<br/>`` or ``</p>``. This has to happen
         before step 3, which would otherwise strip the angle brackets and
         leave the tag names behind as ordinary words.
      3. Replace every character that is not ``a-z`` or a space with a space.
      4. Split on whitespace and discard single-letter tokens.

    Args:
        text: The document as a ``str``. ``None`` and NaN (how pandas
            represents a missing cell) are treated as an empty document.
            Any other non-string value still raises, as before.

    Returns:
        list[str]: Tokens made only of the letters a-z, each at least two
        characters long. Empty when nothing survives the cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = text.lower()

    # Require a letter right after "<" or "</" so that comparisons such as
    # "a < b" are not mistaken for tags.
    text = re.sub(r'</?[a-z][^<>]*>', " ", text)

    # Anything that is not a lowercase ASCII letter or a space becomes a space.
    text = re.sub(r'[^a-z ]', " ", text)

    # str.split() already collapses runs of whitespace. Filtering on length
    # removes every stray single letter, including leading, trailing and
    # consecutive ones that a whitespace-delimited regex would skip.
    return [token for token in text.split() if len(token) > 1]

In [4]:
# Tokenise every document by running the cleaning function over the column:
text_ds['text'] = text_ds['text'].map(preprocess_text)

Lematización: 

In [5]:
# Fetch the NLTK corpora used below: WordNet and the stop-word lists.
download("wordnet")
download("stopwords")

# Shared lemmatizer instance and the English stop-word list.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

def lemmatize_text(words, lemmatizer = lemmatizer):
    """Lemmatize a list of tokens and keep only the informative ones.

    Each word is passed through ``lemmatizer.lemmatize``. A lemma is kept
    only if it is not in the module-level ``stop_words`` and is longer than
    three characters.

    Args:
        words: Iterable of string tokens.
        lemmatizer: Object exposing ``lemmatize(word)``; defaults to the
            module-level lemmatizer instance.

    Returns:
        list[str]: The surviving lemmas, in their original order.
    """
    kept = []
    for lemma in map(lemmatizer.lemmatize, words):
        # Stop words are checked after lemmatization, on the lemma itself.
        if lemma in stop_words:
            continue
        # Keep only lemmas longer than three characters.
        if len(lemma) > 3:
            kept.append(lemma)
    return kept

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Lemmatize and filter the token list of every document:
text_ds['text'] = text_ds['text'].map(lemmatize_text)

Palabras más frecuentes:

In [7]:
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

In [8]:
# Download the NLTK resources listed below. 'wordnet' was already requested
# when the lemmatizer was set up, so that call is redundant on a
# top-to-bottom run.
# NOTE(review): these run after the lemmatizer has already been applied;
# confirm whether 'omw-1.4' is needed earlier on a fresh machine.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\10and\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# Turn each token list back into one space-separated string; cells that are
# not lists are left untouched.
text_ds['text'] = text_ds['text'].map(
    lambda tokens: tokens if not isinstance(tokens, list) else ' '.join(tokens)
)

In [10]:
# Concatenate the whole corpus into a single lowercase string and peek at
# its first 20 characters.
all_words = ' '.join([document.lower() for document in text_ds['text']])
print(all_words[0:20])

medium nobody know k


In [11]:
# Show the frame after cleaning and lemmatization (the notebook display
# elides the middle rows).
text_ds

Unnamed: 0,text,submission_type,subreddit,label
0,medium nobody know kamala medium back nonstop ...,comment,politics,kamala
1,breaking news netanyahu agreed term ceasefire ...,comment,politics,trumper
2,love stock crash week huge news every station ...,comment,politics,trump
3,thinking morning freaking immaculate past mont...,comment,politics,trumper
4,conservative purple state voting harris hell o...,comment,politics,trump
...,...,...,...,...
187608,would apply trump lost fundamentally different...,comment,Republican,trump
187609,many pardon trump issued,comment,Republican,trump
187610,well trump felon,comment,Republican,trump
187611,post keep blasting harris shit even political ...,comment,Republican,kamala


In [12]:
# Count how often each token occurs across the corpus, then keep the 20
# most frequent (token, count) pairs.
word_freq = Counter()
word_freq.update(word_tokenize(all_words, language='english', preserve_line=True))
common_words = word_freq.most_common(20)

In [13]:
# (token, count) pairs for the 20 most frequent tokens, highest count first.
common_words

[('trump', 121040),
 ('people', 66065),
 ('would', 58251),
 ('biden', 50604),
 ('like', 49349),
 ('think', 43157),
 ('vote', 33438),
 ('even', 31504),
 ('state', 30262),
 ('right', 29058),
 ('thing', 26353),
 ('time', 26030),
 ('party', 25149),
 ('election', 24410),
 ('republican', 23861),
 ('also', 23768),
 ('want', 23687),
 ('make', 23639),
 ('year', 23635),
 ('harris', 23597)]

In [14]:
# Persist the processed frame in CSV format, without the pandas index.
# NOTE(review): the target name has no ".csv" extension and is relative to
# the working directory; confirm this is what downstream readers expect.
text_ds.to_csv(path_or_buf='preprocess_text', index=False)

In [15]:
# Number of rows per label, to check how balanced the classes are.
text_ds['label'].value_counts()

label
trump      85349
trumper    85158
kamala     11697
kamaler     5409
Name: count, dtype: int64