In [1]:
import pandas as pd
import spacy
import language_tool_python
import gensim
from gensim import models
from nltk.corpus import stopwords
import json

import tqdm
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
# define functions to preprocess the data
def lemmatize(text,nlp):
    """Return the lemma of every token produced by ``nlp(text)``.

    Parameters
    ----------
    text : str
        The text to analyse.
    nlp : callable
        Called as ``nlp(text)``; must return an iterable of objects that
        expose a ``lemma_`` attribute (e.g. a spaCy pipeline).

    Returns
    -------
    list
        The ``lemma_`` values, in token order.
    """
    # NOTE: documents are processed one at a time; batching is a possible speed-up.
    lemmas = []
    for tok in nlp(text):
        lemmas.append(tok.lemma_)
    return lemmas

def preprocess(text,nlp):
    """Tokenize, filter and lemmatize ``text``.

    Steps, in order:
      1. ``gensim.utils.simple_preprocess`` splits the text into tokens.
      2. Each token is lower-cased and stripped of non-alphabetic characters.
      3. Tokens found in the module-level ``palabrasVacias_nltk`` stopword
         list, or shorter than three characters, are dropped.
      4. The surviving tokens are joined with spaces and passed to
         ``lemmatize`` together with ``nlp``.

    Returns the list produced by ``lemmatize``.
    """
    kept = []
    for raw in gensim.utils.simple_preprocess(text):
        letters = ''.join(filter(str.isalpha, raw.lower()))
        # skip very short tokens and stopwords (filtering happens before lemmatization)
        if len(letters) <= 2 or letters in palabrasVacias_nltk:
            continue
        kept.append(letters)
    return lemmatize(' '.join(kept), nlp)

def remove_words(text):
    """Delete URLs and ``@mentions`` from ``text`` and return the result.

    Two substitutions are applied in sequence: first everything matched by
    the URL pattern is removed, then every ``@`` followed by word
    characters. Neither pattern targets the ``#`` of a hashtag, and no
    hashtag replacement is performed here.
    """
    substitutions = (
        (r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', re.MULTILINE),  # URLs
        (r'@\w+', 0),                                                   # mentions
    )
    cleaned = text
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned

def correct_text(text):
    """Return ``text`` with the corrections suggested by ``corrector`` applied.

    Relies on the module-level ``corrector`` (a
    ``language_tool_python.LanguageTool`` instance).
    """
    # The earlier explicit corrector.check(text) call was dropped: its result
    # was never used, and correct() is expected to run the check itself
    # (per language_tool_python), so it only doubled the work per text.
    return corrector.correct(text)

# Spanish LanguageTool instance (used by correct_text)
corrector = language_tool_python.LanguageTool('es')
# spaCy pipeline used for lemmatization
nlp = spacy.load('es_core_news_lg')

# stopwords: NLTK's Spanish list plus a few project-specific additions
palabrasVacias_nltk = stopwords.words('spanish') + [
    "usted",
    "uds",
    "hacer",
    "bien",
    "navidad",
    "jajaja",
]

In [3]:
# fitted LDA model
lda_model = models.LdaModel.load("final_lda_model/final_lda_model.model")

# dictionary (token <-> id mapping) stored next to the model
dictionary_spanish = gensim.corpora.Dictionary.load("final_lda_model/final_lda_model.model.id2word")

# Rebuild the TF-IDF model from the bag-of-words corpus of early January
# (the corpus that was used to tune the LDA). JSON stores each
# (token_id, count) pair as a list, so convert the pairs back to tuples.
with open('final_lda_model/bow_corpus.json') as f:
    stored_docs = json.load(f)
lda_corpus = [list(map(tuple, doc)) for doc in stored_docs]
tfidf = models.TfidfModel(lda_corpus)

In [4]:
# load tweet data and preprocess
tweets = pd.read_csv('data/tweets.csv')

# 1 when the tweet has a parent tweet id, 0 otherwise.
# Vectorised replacement for int(~np.isnan(j)): that form only gave 0/1
# because np.isnan returns a NumPy bool (on a Python bool `~` yields -1/-2),
# and it raised TypeError for any non-float value in the column.
tweets["is_reply"] = tweets["in_reply_twitter_id"].notna().astype(int)

# strip URLs / mentions, then tokenize + lemmatize
tweets['tweet_c'] = tweets['label'].apply(remove_words)
preprocessed_tweets = tweets['tweet_c'].apply(lambda x: preprocess(x, nlp))

# define the bag of word corpus for all current tweets
bow_corpus = [dictionary_spanish.doc2bow(doc) for doc in preprocessed_tweets]

In [5]:
# get topics for all tweets
tfidf_corpus = tfidf[bow_corpus]
tpcs = lda_model[tfidf_corpus]

# extract the main topic and the corresponding score
#
# Each element of `tpcs` is a list of (topic_id, probability) pairs. gensim
# omits topics whose probability falls below the model's minimum_probability,
# so the POSITION of a pair in the list is not necessarily its topic id.
# The previous np.argmax over the probabilities returned that position; take
# the topic id from the best pair instead.
topic, score = [], []
for t in tqdm.tqdm(tpcs):
    if not t:
        # No topic reported for this document: keep the lists aligned with
        # `tweets` using a placeholder that can never pass a score threshold.
        topic.append(-1)
        score.append(0.0)
        continue
    best_topic, best_score = max(t, key=lambda pair: pair[1])  # first max wins, like argmax
    topic.append(int(best_topic))
    score.append(float(best_score))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90971/90971 [01:29<00:00, 1018.41it/s]


In [21]:
# load the topic names from Bibian
topic_table = pd.read_excel("data/topic_names.xlsx")
topic_table["lda_topic"] = topic_table.index  # row position is used as the LDA topic id
topic_table["NAME"] = [j.replace("\n",'') for j in topic_table['NAME']]
# LDA topic id -> value of the "NUMBER OF TOPICS" column
# (presumably the curated topic number despite the column name -- TODO confirm)
topic_lda_dict = dict(zip(topic_table["lda_topic"],topic_table["NUMBER OF TOPICS"]))
# curated topic number -> human readable name
topic_label_dict = dict(zip(topic_table["NUMBER OF TOPICS"],topic_table["NAME"]))

# Minimum score a tweet's main topic needs in order to be stored.
# NOTE(review): 0.0625 == 1/16, which looks like the uniform probability for a
# 16-topic model -- confirm against the fitted model's num_topics.
MIN_TOPIC_SCORE = 0.0625

# topic/score must come from the same run as `tweets`; guards against stale
# kernel state from out-of-order execution.
assert len(topic) == len(score) == len(tweets), "topic/score are not aligned with tweets"

# Build (tweet id, curated topic, score, is_reply) rows. Iterating the columns
# positionally avoids the per-row label lookup tweets['id'][j], which only
# worked while the frame kept its default 0..n-1 index.
topic_list = []
count = 0
for tweet_id, lda_topic, topic_score, is_reply in zip(tweets['id'], topic, score, tweets['is_reply']):
    if topic_score > MIN_TOPIC_SCORE:
        topic_list.append((int(tweet_id), topic_lda_dict[lda_topic], topic_score, int(is_reply)))
    else:
        count += 1
print(f"{count} tweets without topic assignment")

9702 tweets without topic assignment


In [20]:
# preview only -- the full list has tens of thousands of rows
topic_list[:30]

[(1, 7, 0.3298790156841278, 0),
 (2, 10, 0.7453746795654297, 0),
 (3, 10, 0.625239372253418, 0),
 (4, 2, 0.2948283851146698, 0),
 (5, 9, 0.6427786350250244, 0),
 (6, 8, 0.5065872669219971, 0),
 (7, 10, 0.4905861020088196, 0),
 (8, 1, 0.32690340280532837, 0),
 (9, 5, 0.7708697319030762, 0),
 (10, 10, 0.41996678709983826, 0),
 (11, 2, 0.7038798332214355, 0),
 (12, 6, 0.6640855073928833, 0),
 (13, 10, 0.709193766117096, 0),
 (14, 10, 0.49624502658843994, 0),
 (15, 10, 0.6445345282554626, 0),
 (16, 10, 0.5972833037376404, 0),
 (17, 10, 0.6445556879043579, 0),
 (18, 6, 0.5039275884628296, 0),
 (19, 10, 0.5917335152626038, 0),
 (20, 10, 0.36431244015693665, 0),
 (21, 5, 0.4718078076839447, 0),
 (22, 10, 0.7629178166389465, 0),
 (23, 6, 0.4968182444572449, 0),
 (24, 10, 0.4573518931865692, 0),
 (25, 10, 0.7193025350570679, 0),
 (26, 10, 0.70334392786026, 0),
 (27, 10, 0.7530531883239746, 0),
 (28, 10, 0.6333606243133545, 0),
 (29, 1, 0.3799629807472229, 0),
 (30, 10, 0.7316645383834839, 0),
 

In [12]:
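# NOTE: dead code kept for reference only -- `final_topic` / `final_score` are
# not defined in the cells shown here; `topic_list` is built by the loop in the
# topic-name cell instead.
# # define the list of tuples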
# def listOfTuples(l0,l1, l2,l3):
#     return list(map(lambda w, x, y,z:(int(w),x,y,z), l0,l1, l2,l3))

# topic_list = listOfTuples(tweets['id'],final_topic,final_score,tweets["is_reply"])

In [22]:
# upload data to database
#
# Opens an SSH tunnel to the database host, connects to Postgres through it
# and inserts every row of `topic_list` into `tweet_topic`.
#
# NOTE(review): the SSH and database passwords used to be hardcoded in this
# cell. They are now read from the environment (or prompted for). The old
# values remain in the notebook's history, so they should be rotated.

import getpass
import os

import psycopg2
from sshtunnel import SSHTunnelForwarder


def _secret(env_var, prompt):
    """Return the secret stored in ``env_var``, prompting interactively if it is unset or empty."""
    return os.environ.get(env_var) or getpass.getpass(prompt)


# Resolve credentials up front so a missing secret is reported as such rather
# than as an insert failure.
ssh_password = _secret("TWEETDB_SSH_PASSWORD", "SSH password: ")
db_password = _secret("TWEETDB_DB_PASSWORD", "Database password: ")

try:
    with SSHTunnelForwarder(
        ('161.35.123.231', 22),
        ssh_username="postgres",
        ssh_password=ssh_password,
        remote_bind_address=('localhost', 5432)) as server:

        print("server connected")

        # TCP keepalive settings for the database connection
        keepalive_kwargs = {
            "keepalives": 1,
            "keepalives_idle": 60,
            "keepalives_interval": 10,
            "keepalives_count": 5
        }

        params = {
            'database': 'tweetproject',
            'user': 'postgres',
            'password': db_password,
            # connect to the local end of the tunnel
            'host': server.local_bind_host,
            'port': server.local_bind_port,
            **keepalive_kwargs
        }

        conn = psycopg2.connect(**params)
        try:
            print("database connected")

            sql = "INSERT INTO tweet_topic(id_tweet,id_topic,score,is_reply) VALUES(%s, %s, %s, %s)"
            # `with conn` commits on success and rolls back if the block raises;
            # `with conn.cursor()` closes the cursor on exit.
            with conn, conn.cursor() as curs:
                curs.executemany(sql,topic_list)
                count = curs.rowcount
            print(count, "Record inserted successfully into tweet_topic")
        finally:
            # `with conn` does not close the connection, so do it explicitly
            conn.close()

except Exception as error:
    # psycopg2.Error is a subclass of Exception, so this covers database errors too
    print("Failed to insert record tweet_topic:", error)

server connected
database connected
81269 Record inserted successfully into tweet_topic
