Dataset used:
https://www.kaggle.com/datasets/nelgiriyewithana/emotions?resource=download

In [60]:
import math
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import BernoulliRBM

In [62]:
# Load the first 10,000 posts and fit a TF-IDF vocabulary on them
# (English stop words removed).
df = pd.read_csv('text_data/twitter_posts.csv', usecols=['text'], dtype=str, nrows=10000)
# Empty CSV fields are read as NaN even with dtype=str, and TfidfVectorizer
# rejects NaN documents, so drop them before fitting.
text_data = df['text'].dropna()
vectorizer = TfidfVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(text_data)
print(X_train.shape)
# joblib.dump does not create missing parent directories; make sure the
# export folder exists so the cell also runs on a fresh checkout.
os.makedirs("exports", exist_ok=True)
joblib.dump(vectorizer, "exports/text_vectorizer.pkl")
print("vectorizer saved")

(10000, 11495)
vectorizer saved


In [63]:
# Encode one unseen sentence with the fitted vectorizer and list the terms
# that received a non-zero TF-IDF weight.
new_text = ["I was walking through the park this morning it felt really relaxing."]
text_transform = vectorizer.transform(new_text)
feature_names = vectorizer.get_feature_names_out()
# Single-row sparse result -> flat dense vector aligned with feature_names.
dense_array = text_transform.toarray()[0]
word_tfidf = {
    term: weight
    for term, weight in zip(feature_names, dense_array)
    if weight > 0
}

# Print results
print("TF-IDF Values for New Text:")
for word, tfidf_value in word_tfidf.items():
    print(f"{word}: {tfidf_value:.4f}")

TF-IDF Values for New Text:
felt: 0.3184
morning: 0.3607
park: 0.5140
really: 0.2271
relaxing: 0.5274
walking: 0.4178


In [64]:
# Encode the TF-IDF features with a small Bernoulli RBM (3 hidden units).
# The sparse matrix produced by the vectorizer is passed to fit/transform as-is.
# NOTE(review): the three activations printed for the sample sentence are
# almost identical, so this encoding may not be informative yet - revisit
# learning_rate / n_iter before relying on it.
rbm = BernoulliRBM(
    n_components=3,
    learning_rate=0.001,
    n_iter=3,
    random_state=0,
    verbose=True,
).fit(X_train)
text_encoded = rbm.transform(text_transform)
print(text_encoded)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -5690.29, time = 3.59s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -4279.44, time = 3.68s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -3361.03, time = 3.62s
[[0.89103769 0.89291238 0.8904413 ]]


In [65]:
# Dimensionality reduction that works directly on sparse matrices.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without it the saved model and the printed encoding change on every run.
svd = TruncatedSVD(n_components=4, n_iter=10, random_state=0)
svd.fit(X_train)
# joblib.dump does not create missing parent directories.
os.makedirs("exports", exist_ok=True)
joblib.dump(svd, "exports/svd_model_text.pkl")
print("svd modle saved")
# Project the sample sentence (already TF-IDF encoded) onto the 4 components.
text_encoded = svd.transform(text_transform)
print(text_encoded)
# Total fraction of the training variance captured by the kept components.
print(svd.explained_variance_ratio_.sum())

svd modle saved
[[ 0.05657259 -0.00200136 -0.05442332  0.0939384 ]]
0.020944383495729885


In [66]:
# TODO: add a visualisation of the encodings for several new text captions