### Model Training
- This notebook provides the code to quickly configure settings, train and save a model, and then load it to display its results. If you already have a trained model, skip ahead and execute the cells from the middle of the notebook onwards (that section is labelled).

In [None]:
# Change these settings
# Training configuration read by the cells below. The training cell also
# stores these settings on the trained model as `model.info`.
config = {
    # File name to save the model as (inside ./models/). The first 3 letters
    # select the model type and must be 'lda' or 'nmf'.
    'model_name': 'lda_model_1.pckl',
    # Whether to drop all tweets that contain URLs (Twitter dataset only;
    # only passed on to load_twitter_csv()).
    'drop_linked_tweets': True,
    # Minimum document frequency cut-off (vectoriser `min_df`)
    'min_df': 5,
    # Maximum document frequency cut-off (vectoriser `max_df`)
    'max_df': 0.95,
    # Range of the number of words a term can have (vectoriser `ngram_range`)
    'ngram_range' : (1, 2),
    # Dataset to read & train from
    'corpus': './dataset/twitter.csv',
    # Column name containing the texts. load_twitter_csv() is used if this is
    # "tweet"; otherwise the file is read with pandas.read_csv().
    'df_col_name': 'tweet',
    # Number of rows to read; set to None to read all rows
    'nrows': None,
    # Vectoriser to use: 'tfidf' (TfidfVectorizer) or 'bow' (CountVectorizer)
    'vectoriser': 'tfidf',
    # Number of topics to find (model `n_components`)
    'n_topics': 15,
    # Max iterations for model training
    'max_iter': 1000
}

---

In [None]:
from nlp import *
import pandas as pd

# NLTK data this notebook depends on, handed to check_nltk_resources()
# (imported from nlp) before the stop-word list is built.
resources = [
    'corpora/stopwords',
    'corpora/wordnet',
    'taggers/averaged_perceptron_tagger',
]
check_nltk_resources(resources)

# Stop-word list: NLTK's English stop words, followed by every punctuation
# character, followed by extra terms chosen for this corpus.
stop_words = [
    *stopwords.words('english'),
    *string.punctuation,
    'would', 'could', 'get', 'want', 'he', 'twitter', 'elon', 'musk',
    'well', 'need', 'come', 'really', 'take', 'say', 'go', 'use', 'make',
    'know', 'think', 'deal',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD

# Supported choices, keyed by the strings used in `config`.
vectoriser_classes = {'tfidf': TfidfVectorizer, 'bow': CountVectorizer}
model_classes = {'lda': LatentDirichletAllocation, 'nmf': NMF}

# Validate the settings up front so a typo fails immediately instead of
# after the (slow) loading, preprocessing and vectorising steps.
if config['vectoriser'] not in vectoriser_classes:
    raise ValueError(f"Invalid vectoriser name '{config['vectoriser']}'")

# The first three letters of the file name select the model type.
model_prefix = config['model_name'][:3]
if model_prefix not in model_classes:
    raise ValueError(f"Invalid model name '{config['model_name']}'")


# Load the CSV file using the appropriate function: the project's Twitter
# loader for the "tweet" column, plain pandas otherwise.
if config['df_col_name'] == 'tweet':
    df = load_twitter_csv(
        config['corpus'],
        do_preprocess=False,
        nrows=config['nrows'],
        drop_linked_tweets=config['drop_linked_tweets']
    )
else:
    df = pd.read_csv(config['corpus'], nrows=config['nrows'])


# preprocess_df() is run in place; the cleaned text is then read back from
# the "<column>_preprocessed" column.
preprocess_df(df, txt_col=config['df_col_name'], stop_words=stop_words,
              inplace=True)
corpus = df[f"{config['df_col_name']}_preprocessed"]


# Build the specified vectoriser. It is kept in its own variable rather than
# written back into `config`, so config['vectoriser'] remains the original
# string and this cell can be re-run without raising "Invalid vectoriser name".
vectoriser = vectoriser_classes[config['vectoriser']](
    min_df=config['min_df'], max_df=config['max_df'],
    ngram_range=config['ngram_range']
)

doc_term = vectoriser.fit_transform(corpus)


# Build the specified model. 'random_state' is an optional config key: when
# it is absent, None is passed and scikit-learn's default (unseeded)
# behaviour applies; set it to an int for repeatable topics.
model = model_classes[model_prefix](
    n_components=config['n_topics'],
    max_iter=config['max_iter'],
    random_state=config.get('random_state'),
    verbose=1
)


# Attach the training settings to the model so they are pickled with it.
# model.info['vectoriser'] holds the fitted vectoriser object (not its name),
# which the later cells use to transform text and list the vocabulary.
model.info = {**config, 'vectoriser': vectoriser}

# Alias the older method name to the current one so callers that still use
# get_feature_names() keep working (presumably needed by the pyLDAvis call
# at the end of the notebook -- confirm against the installed version).
vectoriser.get_feature_names = vectoriser.get_feature_names_out


# fit_transform() returns the document-topic matrix; fit() alone would
# return the estimator itself.
doc_topics = model.fit_transform(doc_term)

In [None]:
# Save the trained model
import os
import pickle

save_path = f"./models/{config['model_name']}"

# Create the target directory on first use; without it open() fails with
# FileNotFoundError on a fresh checkout that has no ./models folder yet.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "wb") as file:
    pickle.dump(model, file)

---
### Run the cells from this point onwards to load in a pre-trained model

In [None]:
# Change this setting

# Path of the pickled model to load.
# - If the training cells above were run in this kernel, the model that was
#   just saved (`save_path`) is used.
# - On a fresh kernel, when starting from this section, `save_path` does not
#   exist, so fall back to the default path below. Edit it to point at your
#   own pre-trained model.
try:
    model_path = save_path
except NameError:
    model_path = "./models/lda_model_1.pckl"

In [None]:
# Load a saved model
import pickle
from nlp import *
import pandas as pd


# Read the pickled model back from model_path.
# NOTE(review): unpickling can execute arbitrary code, so only load model
# files that you created yourself or otherwise trust.
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

# Show the settings that were attached to the model as `model.info`.
print(f"Loaded in model with info: {model.info}")

In [None]:
from nlp import *
import pandas as pd

# NLTK data this section depends on, handed to check_nltk_resources()
# (imported from nlp) before the stop-word list is built.
resources = [
    'corpora/stopwords',
    'corpora/wordnet',
    'taggers/averaged_perceptron_tagger',
]
check_nltk_resources(resources)

# Same stop-word list as in the training section: NLTK's English stop words,
# then every punctuation character, then the corpus-specific extras.
stop_words = [
    *stopwords.words('english'),
    *string.punctuation,
    'would', 'could', 'get', 'want', 'he', 'twitter', 'elon', 'musk',
    'well', 'need', 'come', 'really', 'take', 'say', 'go', 'use', 'make',
    'know', 'think', 'deal',
]


# Re-read the corpus using the settings stored on the loaded model: the
# project's Twitter loader for the "tweet" column, plain pandas otherwise.
text_col = model.info['df_col_name']
if text_col == 'tweet':
    df = load_twitter_csv(
        model.info['corpus'],
        do_preprocess=False,
        nrows=model.info['nrows'],
        drop_linked_tweets=model.info['drop_linked_tweets'],
    )
else:
    df = pd.read_csv(model.info['corpus'], nrows=model.info['nrows'])

# Preprocess in place, then pick up the "<column>_preprocessed" column.
preprocess_df(df, txt_col=text_col, stop_words=stop_words, inplace=True)
corpus = df[f"{text_col}_preprocessed"]

# Vectorise with the vectoriser stored on the model (transform only, no
# re-fitting), then project the documents onto the model's topics.
doc_terms = model.info['vectoriser'].transform(corpus)
doc_topics = model.transform(doc_terms)

In [None]:
# Topic-term report: the model's topic-term matrix together with the
# vectoriser's vocabulary, also dumped to a text file.
topic_vocab = model.info['vectoriser'].get_feature_names_out()
print_topic_terms(
    topic_terms=model.components_,
    vocab=topic_vocab,
    n_words=50,
    dump_to_file=True,
    dump_file_name="./model_training_topic_terms_dump.txt",
)

# Document-topic report: uses the original (un-preprocessed) text column so
# the output shows the documents as they appear in the dataset.
original_texts = df[model.info['df_col_name']]
print_doc_topics(
    doc_topics=doc_topics,
    corpus=original_texts,
    n_docs=100,
    dump_to_file=True,
    dump_file_name="./model_training_doc_topics_dump.txt",
)

In [None]:
import pyLDAvis.sklearn

%matplotlib widget

plot_document_matrix(doc_topics, dimension=2, decomposer='tsne')
plt.show()

pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(model, doc_terms, model.info['vectoriser'])