# TweeHunch

## Importamos librerías

In [None]:
import requests
import os
import nltk 
import re
import string
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize, pos_tag, pos_tag_sents


### Librerias de nltk

#### nltk.download('punkt')
#### nltk.download('stopwords')
#### nltk.download('tagsets')
#### nltk.download('averaged_perceptron_tagger')


# Obtener datos de la API 

## Cargar valor del Token en la aplicación

In [None]:
import os
from dotenv import load_dotenv
# Populate the environment from the .env file, then read the API bearer token from it
load_dotenv()
bearer_token = os.getenv("BEARER_TOKEN")

### definimos consulta a la API

In [None]:
# Endpoint of the Twitter API v2 recent-search resource used for every request
url = "https://api.twitter.com/2/tweets/search/recent"

### definimos parametros

In [None]:
# Upper bound on the number of result pages requested by get_data
total_page = 10

# Query-string parameters of the search: English, non-retweet tweets tagged #venom,
# with the creation date included and 100 tweets per page
params = {
    "query": "#venom lang:en -is:retweet",
    "tweet.fields": "created_at",
    "max_results": 100,
}


### definimos cabecera

In [None]:
# HTTP headers sent with each API request: bearer-token authorization and a client name
headers = {
    "Authorization": "Bearer {}".format(bearer_token),
    "User-Agent": "TweeHunch",
}

## Obtener tweets con paginación

In [None]:
# Single probe request: show the raw response object first
response = requests.get(url, params=params, headers=headers)
print(response)

# Anything other than HTTP 200 aborts with the status code and body
if not response.status_code == 200:
    raise Exception(response.status_code, response.text)

# Show the paging metadata returned alongside the tweets
print(dict(response.json())['meta'])

def get_data(url,params,total_page):
    """Download up to ``total_page`` pages of search results into one DataFrame.

    Each page is requested with the module-level ``headers``. When the response
    metadata carries a ``next_token`` it is printed and sent with the following
    request; paging stops when no token is returned or the page limit is reached.

    Args:
        url: Search endpoint to query.
        params: Query parameters of the search. The dict is copied, so the
            caller's object is never modified, and its query is reused on
            every page.
        total_page: Maximum number of pages to request.

    Returns:
        A DataFrame with one row per tweet and a fresh 0..n-1 index, or an
        empty DataFrame when no page contained any tweet.

    Raises:
        Exception: With ``(status_code, text)`` when a response is not HTTP 200.
    """
    # Copy so that adding next_token does not leak into the caller's dict
    page_params = dict(params)
    results = []
    for _ in range(total_page):
        response = requests.get(url, headers=headers, params=page_params)
        # Raise if the response is not successful
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        payload = response.json()
        # A page without matches may omit 'data'; treat it as an empty page
        data = payload.get('data', [])
        if data:
            results.append(pd.json_normalize(data))
        token = payload.get('meta', {}).get('next_token')
        if token is None:
            break
        print(token)
        page_params['next_token'] = token
    if not results:
        # pd.concat refuses an empty list, so return an empty frame explicitly
        return pd.DataFrame()
    # ignore_index avoids repeating the per-page row labels in the combined frame
    return pd.concat(results, ignore_index=True)

# Fetch the tweets (at most total_page pages) into the working DataFrame
df = get_data(url,params, total_page)
#df.drop(columns=['withheld.copyright','withheld.country_codes'],inplace=True)


## Filtrar columnas

In [None]:
# Keep only the tweet text. The explicit copy makes df an independent frame, so the
# column assignments in later cells do not write into a slice of the downloaded data.
df = df[['text']].copy()
#df

## Guardar tweets en CSV

In [None]:
# Persist the raw tweet text (with the row index) to tweets.csv
df.to_csv('tweets.csv')

# Tokenizamos

## Filtro

In [None]:
# Patterns removed from the tweet text: URLs, @mentions and the '#' sign of hashtags
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
MENTIONS_REGEX = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
HASHTAG_REGEX = r"#"

# (pattern, replacement) pairs, applied in this order
_cleanup_steps = [
    (URL_REGEX, ''),
    (MENTIONS_REGEX, ''),
    (HASHTAG_REGEX, ''),
    # runs of anything except ASCII letters, digits, space, '|' and newline
    (r"[^A-Za-z0-9 | \n]+", ' '),
    (r"\t", ' '),
    ('[{}]'.format(string.punctuation), ' '),
]

# Build the cleaned column and assign it back. A chained
# df["text"].replace(..., inplace=True) operates on a temporary Series and is not
# guaranteed to reach df (it never does under pandas Copy-on-Write).
cleaned_text = df["text"]
for pattern, replacement in _cleanup_steps:
    cleaned_text = cleaned_text.replace(pattern, replacement, regex=True)

df["text"] = cleaned_text.str.lower()

In [None]:
# Split every cleaned tweet into tokens with NLTK's tweet-aware tokenizer and keep
# the result both as a standalone Series and as a new column of df
tt = TweetTokenizer()
df["tokenized_text"] = tokenized_text = df['text'].apply(tt.tokenize)

## Guardar tweets tokenizados en CSV

In [None]:
# Persist the cleaned text together with its token lists
df.to_csv('tweets_tokenizados.csv')

## Frecuencia de tweets tokenizados

In [None]:
# One row per token
tokenized_list = df.explode('tokenized_text')
# Frequency of each term. Tweets whose token list is empty explode to NaN; drop those
# so NaN is not counted as a term.
fdist = FreqDist(tokenized_list['tokenized_text'].dropna())
# Convert to a DataFrame. Naming the column up front keeps this working when there are
# no tokens at all (renaming the columns of an empty table would fail).
df_fdist = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist.index.name = 'Term'
df_fdist.sort_values(by=['Frequency'], inplace=True, ascending=False)
# Show every row when the table is displayed
pd.set_option('display.max_rows', None)
#df_fdist

## Nube de palabras de tweets tokenizados

In [None]:
# Build the cloud from the tokens themselves, joined by spaces as the later word-cloud
# cells do. Series.to_string() would hand WordCloud the printed table (row labels,
# brackets and quotes) instead of the words.
data = " ".join(str(token) for tokens in df['tokenized_text'] for token in tokens)
wordcloud = WordCloud(max_words=1000, background_color="white", collocations = False,min_font_size = 6).generate(data)

# Show the plot
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [100, 100]
plt.show()

# Stop Words

In [None]:
from nltk.corpus import stopwords

# Rebind the name to the set of English stop words
stopwords = set(stopwords.words('english'))
# Flatten all token lists into one list, leaving out the stop words
no_stopwords_data = [
    word
    for x in tokenized_text
    for word in x
    if word.lower() not in stopwords
]
#no_stopwords_data

In [None]:
# Turn the stop-word-free token list into a one-column DataFrame for the frequency
# analysis. Passing the column name directly guarantees the 'no_stopwords' column
# exists even when the list is empty.
df_no_stopwords = pd.DataFrame(no_stopwords_data, columns=['no_stopwords'])
#df_no_stopwords[df["no_stopwords"].str.contains("the")]
#df_no_stopwords

In [None]:
# Single letters to discard. string.ascii_lowercase covers all 26 letters, which a
# hand-typed alphabet can easily miss (e.g. 'q').
lista_abc = list(string.ascii_lowercase)

# Strip digits, whitespace and the characters | ? . * from every token. The result is
# assigned back because a chained replace(..., inplace=True) works on a temporary
# Series and is not guaranteed to reach the DataFrame.
df_no_stopwords["no_stopwords"] = df_no_stopwords["no_stopwords"].replace(r"[ \d | \s | ?.* ]+",'',regex=True)

# Drop tokens that ended up empty as well as stray single letters
faltantes_index = df_no_stopwords[df_no_stopwords['no_stopwords'].isin([''] + lista_abc)].index
df_no_stopwords = df_no_stopwords.drop(faltantes_index, axis=0)



In [None]:
tokenized_list = df_no_stopwords.explode('no_stopwords')
# Frequency of each term
fdist = FreqDist(tokenized_list['no_stopwords'])
# Convert to a DataFrame. Naming the column up front keeps this working when no term
# is left (renaming the columns of an empty table would fail).
df_fdist = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist.index.name = 'Term'
df_fdist.sort_values(by=['Frequency'], inplace=True, ascending=False)
# Show every row when the table is displayed
pd.set_option('display.max_rows', None)
#df_fdist

In [None]:


################################################################

# Disabled plotting draft kept as a bare string literal; none of it is executed.
# NOTE(review): as written it would index df_fdist["Term"], while the cells above set
# 'Term' as the index name rather than as a column — confirm before re-enabling.
"""
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize = (50, 50))
x = df_fdist["Term"].tolist()
y = df_fdist["Frequency"].tolist()

plt.show()
"""


## tweets tokenizados sin stop words a csv

In [None]:
# Write df (cleaned text plus token lists) to the file read back by the polarity section.
# NOTE(review): despite the file name, df still contains stop words; the filtered
# tokens live in df_no_stopwords — confirm which one was meant to be saved.
df.to_csv('tweets_without_stopwords.csv')
#df

## Nube de palabras de tweets tokenizados sin stop words

In [None]:
# Join the stop-word-free tokens into one space-separated string and build the cloud
data = " ".join(str(term) for term in no_stopwords_data)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

# Lematizacion

## Aplicamos etiquetado POS

In [None]:
# Flat Python list of the filtered tokens
list_filter = df_no_stopwords["no_stopwords"].tolist()

#print(list_filter)

In [None]:
# Frequency of each token in list_filter
fdist = FreqDist(list_filter)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_list_filter = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_list_filter.index.name = 'Term'
df_fdist_list_filter.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_list_filter

In [None]:
# Join the filtered tokens into one space-separated string and build the cloud
data = " ".join(str(term) for term in list_filter)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Tag every token of list_filter with pos_tag; the result is iterated below as
# (token, tag) pairs
data_pos = nltk.pos_tag(list_filter)
#data_pos

In [None]:
# Tag-prefix patterns, one per word class
p_adj = re.compile(r"(^JJ)")
p_noun = re.compile(r"(^NN)")
p_verb = re.compile(r"(^VB)")
p_adverb = re.compile(r"(^RB)")

# Tokens left out of the buckets: "cr" is skipped everywhere, while "venom" and
# "carnage" are skipped in every bucket except the nouns
_SKIP_IN_NOUNS = ("cr",)
_SKIP_ELSEWHERE = ("venom", "carnage", "cr")

adjectives, nouns, verbs, adverbs, others = [], [], [], [], []
for k,v in data_pos:
    # The patterns are prefixes, so they are applied with match(). fullmatch() only
    # accepted the bare two-letter tags and sent longer tags with the same prefix
    # (e.g. JJR, NNS, VBD, RBR) to `others`.
    if p_adj.match(v):
        bucket, skipped = adjectives, _SKIP_ELSEWHERE
    elif p_noun.match(v):
        bucket, skipped = nouns, _SKIP_IN_NOUNS
    elif p_verb.match(v):
        bucket, skipped = verbs, _SKIP_ELSEWHERE
    elif p_adverb.match(v):
        bucket, skipped = adverbs, _SKIP_ELSEWHERE
    else:
        bucket, skipped = others, _SKIP_ELSEWHERE

    if k not in skipped:
        bucket.append(k)

#print(f'adjetivos: {adjectives} \n\n\n\n\n sustantivos: {nouns} \n\n\n\n\n\n verbos: {verbs} \n\n\n\n\n\n adverbios:{adverbs} \n\n\n\n\n\n otros: {others}')

In [None]:
# Frequency of each adjective
fdist = FreqDist(adjectives)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_adj = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_adj.index.name = 'Term'
df_fdist_adj.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_adj

In [None]:
# Join the adjectives into one space-separated string and build the cloud
data = " ".join(str(term) for term in adjectives)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Frequency of each noun
fdist = FreqDist(nouns)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_noun = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_noun.index.name = 'Term'
df_fdist_noun.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_noun

In [None]:
# Join the nouns into one space-separated string and build the cloud
data = " ".join(str(term) for term in nouns)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Frequency of each verb
fdist = FreqDist(verbs)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_verbs = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_verbs.index.name = 'Term'
df_fdist_verbs.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_verbs

In [None]:
# Join the verbs into one space-separated string and build the cloud
data = " ".join(str(term) for term in verbs)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Frequency of each adverb
fdist = FreqDist(adverbs)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_adv = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_adv.index.name = 'Term'
df_fdist_adv.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_adv

In [None]:
# Join the adverbs into one space-separated string and build the cloud
data = " ".join(str(term) for term in adverbs)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Frequency of the tokens that fell into none of the four word classes
fdist = FreqDist(others)
# Convert to a DataFrame. Naming the column up front keeps this working when the list
# is empty (renaming the columns of an empty table would fail).
df_fdist_other = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist_other.index.name = 'Term'
df_fdist_other.sort_values(by=['Frequency'], inplace=True, ascending=False)

#df_fdist_other

In [None]:
# Join the remaining tokens into one space-separated string and build the cloud
data = " ".join(str(term) for term in others)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=6,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

# Análisis de polaridad

In [None]:
# Reload the saved tweets and discard the token-list column and the 'Unnamed: 0' column
dfSentiment = pd.read_csv("tweets_without_stopwords.csv").drop(
    columns=['tokenized_text', 'Unnamed: 0']
)
#dfSentiment

In [None]:
# Patterns removed from the tweet text: URLs, @mentions and whole #hashtags
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
MENTIONS_REGEX = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
HASHTAG_REGEX = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9-_]+)"

# (pattern, replacement) pairs, applied in this order
_sentiment_cleanup_steps = [
    (URL_REGEX, ''),
    (MENTIONS_REGEX, ''),
    (HASHTAG_REGEX, ''),
    # runs of anything except ASCII letters, digits, space, '|' and newline
    (r"[^A-Za-z0-9 | \n]+", ' '),
    # single tab, space, '|' or newline characters
    (r"[\t | \n]", ' '),
    ('[{}]'.format(string.punctuation), ' '),
]

# Build the cleaned column and assign it back. A chained
# dfSentiment["text"].replace(..., inplace=True) operates on a temporary Series and is
# not guaranteed to reach dfSentiment (it never does under pandas Copy-on-Write).
cleaned_text = dfSentiment["text"]
for pattern, replacement in _sentiment_cleanup_steps:
    cleaned_text = cleaned_text.replace(pattern, replacement, regex=True)

dfSentiment["text"] = cleaned_text.str.lower()
#dfSentiment



In [None]:
# Instantiate the analyzer
sentiment_analyzer = SentimentIntensityAnalyzer()


def _label_from_compound(compound):
    """Map a compound polarity score to "Positive", "Negative" or "Neutral"."""
    if compound >= 0.45:
        return "Positive"
    if compound <= -0.24:
        return "Negative"
    return "Neutral"


# Score every tweet once. Cells that are not strings are scored as empty text
# (presumably tweets left empty by the cleaning and read back from the CSV as NaN).
analisis_por_tweet = [
    sentiment_analyzer.polarity_scores(text if isinstance(text, str) else "")
    for text in dfSentiment["text"]
]
resultados = [_label_from_compound(analisis["compound"]) for analisis in analisis_por_tweet]

# Assign whole columns. Writing into the rows yielded by iterrows() is not reliable:
# each row may be a copy, in which case the DataFrame is never updated.
dfSentiment["negative"] = [analisis["neg"] for analisis in analisis_por_tweet]
dfSentiment["neutral"] = [analisis["neu"] for analisis in analisis_por_tweet]
dfSentiment["positive"] = [analisis["pos"] for analisis in analisis_por_tweet]
dfSentiment["result"] = resultados

count_negative = resultados.count("Negative")
count_neutral = resultados.count("Neutral")
count_positive = resultados.count("Positive")

total = count_negative+count_neutral+count_positive
print(f'positivo: {count_positive} negativo: {count_negative} neutral: {count_neutral} total: {total}')
# Proportions are only defined when at least one tweet was scored
if total:
    print(f'positivo: {count_positive/total} negativo: {count_negative/total} neutral: {count_neutral/total}')
#dfSentiment


## WordCloud de positivos

In [None]:
from nltk.tokenize import TweetTokenizer

# Independent copy of the positive rows, so adding a column below does not write into
# a slice of dfSentiment
df_aux = dfSentiment[dfSentiment['result'] == 'Positive'].copy()

# Tokenize
tt = TweetTokenizer()

tokenized_text = df_aux['text'].apply(tt.tokenize)
df_aux["tokenized_text"] = tokenized_text

In [None]:
from nltk.corpus import stopwords

# Rebind the name to the set of English stop words
stopwords = set(stopwords.words('english'))
# Flatten the positive tweets' tokens into one list, leaving out the stop words
no_stopwords_data = [
    word
    for x in tokenized_text
    for word in x
    if word.lower() not in stopwords
]
#print(no_stopwords_data)

In [None]:
# Join the positive tweets' tokens into one space-separated string and build the cloud
data = " ".join(str(term) for term in no_stopwords_data)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=20,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

# WordCloud de negativos

In [None]:
# Independent copy of the negative rows, so adding a column below does not write into
# a slice of dfSentiment
df_aux = dfSentiment[dfSentiment['result'] == 'Negative'].copy()

# Tokenize
tt = TweetTokenizer()

tokenized_text = df_aux['text'].apply(tt.tokenize)
df_aux["tokenized_text"] = tokenized_text

In [None]:
from nltk.corpus import stopwords

# Rebind the name to the set of English stop words
stopwords = set(stopwords.words('english'))
# Flatten the negative tweets' tokens into one list, leaving out the stop words
no_stopwords_data = [
    word
    for x in tokenized_text
    for word in x
    if word.lower() not in stopwords
]
#print(no_stopwords_data)

In [None]:
# Join the negative tweets' tokens into one space-separated string and build the cloud
data = " ".join(str(term) for term in no_stopwords_data)
wordcloud = WordCloud(
    max_words=1000,
    background_color="white",
    collocations=False,
    min_font_size=20,
).generate(data)

# Draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# NOTE(review): this rcParams value is assigned after imshow() has already drawn on the
# current figure — confirm whether it was meant to size this plot or the following ones
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Show all negative tweets: the whole frame with every column, then each text on its own line
pd.set_option('display.max_columns', None)
print(df_aux)
for negative_text in df_aux['text']:
    print(negative_text)