In [1]:
import pandas as pd
import numpy as np
import re
import os
import json
import csv
from ast import literal_eval
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from octis.preprocessing.preprocessing import Preprocessing

In [29]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [27]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paola\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\paola\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
path= "C:MyPath/Topic/"

In [97]:
df= pd.read_csv(path+ "vax_twitter.csv")

In [103]:
df =df.groupby('username').agg({'text': ' '.join})


In [104]:
def tokenize(text):
    tokens = re.split('\W+',text)
    return tokens

df['token'] = df['text'].apply(tokenize)

In [105]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['from', 'll', 'would', 'doesn', 're','aaaaaand','aaaaand','aaaand'])

def remove_stopwords(text):
    tokenized_text_without_stopwords = []
    for token in text:
        if token not in stop_words:
            tokenized_text_without_stopwords.append(token)
    return tokenized_text_without_stopwords

df['token'] = df['token'].apply(remove_stopwords)

In [106]:
#defining the object for lemmatization
wn_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    lemm_text = [wn_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

df['lemma'] = df['token'].apply(lemmatizer)

In [107]:
df['lemmatized']= [' '.join(map(str, l)) for l in df['lemma']]

In [109]:
df.to_csv(path + "vax_twitter_aggr.csv")

In [110]:
txt_file = (path + "vax_twitter_text.txt")
with open(txt_file, "w") as my_output_file:
    for line in df['lemmatized'].tolist():
        try:
            my_output_file.write("".join(line)+'\n') 
        except:
            pass
    my_output_file.close()


In [111]:
#os.chdir(os.path.pardir)
preprocessor = Preprocessing(vocabulary=None, max_features=None,remove_punctuation=True, min_df=5, lemmatize=False, 
                             stopword_list=None, min_chars=2, min_words_docs=3, split=True, save_original_indexes=True)
dataset = preprocessor.preprocess_dataset(documents_path=path + "vax_twitter_text.txt" , labels_path=None)

dataset.save(r"C:\MyPath\Vax_twitter")

created vocab
7500


In [2]:
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.models.ProdLDA import ProdLDA

In [4]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(path)

In [5]:
epoch=50
model= ProdLDA(solver='adam', num_epochs= epoch, num_topics=20)
optimizer = Optimizer()
npmi = Coherence(texts=dataset.get_corpus(), measure='c_npmi')
topic_diversity = TopicDiversity()
search_space = {"num_layers": Categorical({1, 2}), 
                "num_neurons": Categorical({50, 100, 150}),
                "num_topics": Categorical({5, 8, 10, 15}),
                "activation": Categorical({'sigmoid', 'relu', 'softplus'}), 
                "dropout": Real(0.2, 0.6)
}

In [10]:
optimization_runs=10
model_runs=1
optimization_result = optimizer.optimize(
    model, dataset, npmi, 
    search_space, 
    early_stop=True, 
    number_of_call=optimization_runs, 
    model_runs=model_runs, 
    extra_metrics=[topic_diversity], 
    save_path=path)

Current call:  0
Epoch: [1/50]	Samples: [16838/841900]	Train Loss: 1161.8715910498668	Time: 0:00:06.626065
Epoch: [1/50]	Samples: [3609/180450]	Validation Loss: 1105.152616505611	Time: 0:00:00.706944
Epoch: [2/50]	Samples: [33676/841900]	Train Loss: 1116.8262325549213	Time: 0:00:06.849839
Epoch: [2/50]	Samples: [3609/180450]	Validation Loss: 1102.6820237816917	Time: 0:00:00.744018
Epoch: [3/50]	Samples: [50514/841900]	Train Loss: 1114.011811488059	Time: 0:00:06.647463
Epoch: [3/50]	Samples: [3609/180450]	Validation Loss: 1103.4511799927266	Time: 0:00:00.733302
Epoch: [4/50]	Samples: [67352/841900]	Train Loss: 1112.9609437057363	Time: 0:00:06.665724
Epoch: [4/50]	Samples: [3609/180450]	Validation Loss: 1101.994151989817	Time: 0:00:00.741723
Epoch: [5/50]	Samples: [84190/841900]	Train Loss: 1110.856466517849	Time: 0:00:06.616229
Epoch: [5/50]	Samples: [3609/180450]	Validation Loss: 1098.539675658943	Time: 0:00:00.731150
Epoch: [6/50]	Samples: [101028/841900]	Train Loss: 1108.194803571627

Current call:  4
Epoch: [1/50]	Samples: [16838/841900]	Train Loss: 1172.5458333642653	Time: 0:00:09.800034
Epoch: [1/50]	Samples: [3609/180450]	Validation Loss: 1.2463222425346956e+20	Time: 0:00:00.827126
Epoch: [2/50]	Samples: [33676/841900]	Train Loss: 1120.5585993757609	Time: 0:00:09.474563
Epoch: [2/50]	Samples: [3609/180450]	Validation Loss: 6.324651085486597e+17	Time: 0:00:00.801619
Epoch: [3/50]	Samples: [50514/841900]	Train Loss: 1109.6496427467819	Time: 0:00:09.619173
Epoch: [3/50]	Samples: [3609/180450]	Validation Loss: 1113.3254131381616	Time: 0:00:00.812191
Epoch: [4/50]	Samples: [67352/841900]	Train Loss: 1103.818175870798	Time: 0:00:09.478268
Epoch: [4/50]	Samples: [3609/180450]	Validation Loss: 1093.0357667030341	Time: 0:00:00.807607
Epoch: [5/50]	Samples: [84190/841900]	Train Loss: 1097.1533414236028	Time: 0:00:09.385010
Epoch: [5/50]	Samples: [3609/180450]	Validation Loss: 1101.3554925619978	Time: 0:00:00.807960
Epoch: [6/50]	Samples: [101028/841900]	Train Loss: 1094.5

Epoch: [16/50]	Samples: [269408/841900]	Train Loss: 1095.401665990802	Time: 0:00:09.503535
Epoch: [16/50]	Samples: [3609/180450]	Validation Loss: 1100.2279614548697	Time: 0:00:00.778240
Epoch: [17/50]	Samples: [286246/841900]	Train Loss: 1093.8362599082989	Time: 0:00:09.571467
Epoch: [17/50]	Samples: [3609/180450]	Validation Loss: 1099.321759620047	Time: 0:00:00.794347
Epoch: [18/50]	Samples: [303084/841900]	Train Loss: 1094.2966260745784	Time: 0:00:09.459016
Epoch: [18/50]	Samples: [3609/180450]	Validation Loss: 1103.7960085679897	Time: 0:00:00.793583
Epoch: [19/50]	Samples: [319922/841900]	Train Loss: 1094.2210122663628	Time: 0:00:09.460917
Epoch: [19/50]	Samples: [3609/180450]	Validation Loss: 1102.0726309226932	Time: 0:00:00.789517
Early stopping
Current call:  7
Epoch: [1/50]	Samples: [16838/841900]	Train Loss: 1185.284194723361	Time: 0:00:10.048525
Epoch: [1/50]	Samples: [3609/180450]	Validation Loss: 8.311749132532169e+19	Time: 0:00:00.851981
Epoch: [2/50]	Samples: [33676/841900

Epoch: [7/50]	Samples: [117866/841900]	Train Loss: 1102.586644576952	Time: 0:00:06.772547
Epoch: [7/50]	Samples: [3609/180450]	Validation Loss: 1105.1977700064076	Time: 0:00:00.752322
Epoch: [8/50]	Samples: [134704/841900]	Train Loss: 1097.9216123581612	Time: 0:00:06.872302
Epoch: [8/50]	Samples: [3609/180450]	Validation Loss: 1102.8263915947978	Time: 0:00:00.753491
Epoch: [9/50]	Samples: [151542/841900]	Train Loss: 1096.021125819737	Time: 0:00:06.781436
Epoch: [9/50]	Samples: [3609/180450]	Validation Loss: 1091.0747823908112	Time: 0:00:00.736370
Epoch: [10/50]	Samples: [168380/841900]	Train Loss: 1096.5633678171212	Time: 0:00:06.771166
Epoch: [10/50]	Samples: [3609/180450]	Validation Loss: 1094.4108294801192	Time: 0:00:00.741944
Epoch: [11/50]	Samples: [185218/841900]	Train Loss: 1096.92849773856	Time: 0:00:06.775175
Epoch: [11/50]	Samples: [3609/180450]	Validation Loss: 1090.5619420415974	Time: 0:00:00.733392
Epoch: [12/50]	Samples: [202056/841900]	Train Loss: 1097.127722824791	Time:

In [None]:
results = json.load(open(path+ "result.json","r"))
max_n = results[ "f_val"].index(max(results[ "f_val"]))
params = [results["x_iters"][parameter][max_n] for parameter in results["x_iters"].keys()]

In [13]:
results['x_iters'].keys()

dict_keys(['activation', 'dropout', 'num_layers', 'num_neurons', 'num_topics'])

In [14]:
model = ProdLDA(solver='adam', num_epochs= 50, num_topics=10,
                activation= 'relu', dropout= 0.51, num_layers=2, num_neurons= 150,
                batch_size=10)
output = model.train_model(dataset)

Epoch: [1/50]	Samples: [16838/841900]	Train Loss: 1134.9879682844248	Time: 0:00:33.804795
Epoch: [1/50]	Samples: [3609/180450]	Validation Loss: 1143.9479177490302	Time: 0:00:01.018436
Epoch: [2/50]	Samples: [33676/841900]	Train Loss: 1121.1723154912809	Time: 0:00:33.623546
Epoch: [2/50]	Samples: [3609/180450]	Validation Loss: 1139.1670630992235	Time: 0:00:01.149272
Epoch: [3/50]	Samples: [50514/841900]	Train Loss: 1118.2702085452177	Time: 0:00:34.129766
Epoch: [3/50]	Samples: [3609/180450]	Validation Loss: 1147.826733858442	Time: 0:00:01.235061
Epoch: [4/50]	Samples: [67352/841900]	Train Loss: 1118.9821073833646	Time: 0:00:34.094274
Epoch: [4/50]	Samples: [3609/180450]	Validation Loss: 1133.4556759617342	Time: 0:00:01.153417
Epoch: [5/50]	Samples: [84190/841900]	Train Loss: 1117.9889372601222	Time: 0:00:35.496224
Epoch: [5/50]	Samples: [3609/180450]	Validation Loss: 1140.5650474102288	Time: 0:00:01.165171
Epoch: [6/50]	Samples: [101028/841900]	Train Loss: 1118.7085385116975	Time: 0:00:

In [1]:
#view topics
#for topic in output['topics']:
#    print(" ".join(topic))
#    print("----------------------------------------------------------------------------------------------------------")