In [1]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
from contractions import contractions_dict
import contractions
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS as spacy_stopwords
import spacy
from tqdm import tqdm
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter

In [None]:
!python -m spacy download en_core_web_md

In [None]:
# Load the medium English spaCy pipeline with the "ner" and "parser"
# components disabled; later cells only read `token.lemma_` from it.
nlp = spacy.load(
    "en_core_web_md",
    disable=["ner", "parser"],
)

In [None]:
# Stop-word set = NLTK's English list merged with spaCy's English list.
combined_stopwords = set(stopwords.words('english')) | set(spacy_stopwords)

In [None]:
# The CSV is read with header=None, so the six column names are supplied here.
data = pd.read_csv(
    "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
    names=["sentiment","id","date","flag","username","text"],
    header=None,
    encoding="latin-1",
)

In [None]:
data.head()

In [None]:
# Keep only the label and the tweet text.
# The metadata columns are dropped by name and without `inplace`, so running
# this cell a second time is a no-op. The previous positional, in-place form
# (`data.columns[1:5]`) removed the "text" column on a re-run, because after
# the first run `columns[1:5]` refers to whatever columns are left.
data = data.drop(columns=["id","date","flag","username"],errors="ignore")

In [None]:
data.head()

In [None]:
def normalize_tweet(tweet):
    """Return ``tweet`` with every character converted to lower case."""
    lowered = tweet.lower()
    return lowered

In [None]:
# Lower-case every tweet. The mapped results are materialised into a list
# while the executor is still open.
# NOTE(review): the workers are threads; for pure-Python string work this is
# presumably no faster than a plain Series.map — confirm by timing.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(normalize_tweet, data["text"])]

In [None]:
def fix_contractions(tweet):
    """Return ``tweet`` after passing it through ``contractions.fix``.

    Example of the kind of input this targets:
    "I'll be there within 5 min. Are u not gng there? Am I mssng out on
    smthng? I'd like to see u near d park."
    """
    expanded = contractions.fix(tweet)
    return expanded

In [None]:
# Apply fix_contractions to every tweet and store the results back in "text".
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    expanded_tweets = pool.map(fix_contractions, data["text"])
    data["text"] = list(expanded_tweets)

In [None]:
def remove_noisy_tokens(tweet):

    return re.sub(pattern=r'@[a-zA-Z0-9 ]+|#[a-zA-Z0-9 ]+|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+',
                 string=tweet,repl=" ")

In [None]:
# Strip mentions, hashtags, URLs, digits and punctuation from every tweet.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(remove_noisy_tokens, data["text"])]

In [None]:
def remove_remaining_noisy_tokens(tweet):
    """Second cleaning pass: replace leftover noise in ``tweet`` with spaces.

    Matches stand-alone single word characters, runs of non-ASCII characters,
    runs of underscores and runs of non-word characters; each match becomes
    one space.
    """
    leftover_noise = re.compile(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+')
    return leftover_noise.sub(" ", tweet)

In [None]:
# Run the second cleaning pass over every tweet.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    recleaned_tweets = pool.map(remove_remaining_noisy_tokens, data["text"])
    data["text"] = list(recleaned_tweets)

In [None]:
# Tokenise each cleaned tweet with NLTK's word_tokenize; from here on every
# entry of data["text"] is whatever word_tokenize returns for that tweet.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    data["text"] = [*pool.map(word_tokenize, data["text"])]

In [None]:
def is_stopword(token):
    """Return True when ``token`` is NOT in ``combined_stopwords``.

    The name reads the opposite way round: a True result means "keep this
    token". ``remove_stopwords`` relies on this polarity, so it is preserved.
    """
    if token in combined_stopwords:
        return False
    return True

In [None]:
def remove_stopwords(tokenized_tweet):
    """Return the tokens of ``tokenized_tweet`` for which ``is_stopword`` is truthy.

    ``is_stopword`` returns True for tokens that are not stop words, so the
    result is the tweet with its stop words removed, order preserved.
    """
    return list(filter(is_stopword, tokenized_tweet))

In [None]:
# Filter the stop words out of every tokenised tweet.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    filtered_tweets = pool.map(remove_stopwords, data["text"])
    data["text"] = list(filtered_tweets)

In [None]:
# Checkpoint: pickle the current "text" column (token lists after stop-word
# removal) to the working directory so later stages can resume from here.
with open("stopwords_removed.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

In [None]:
def lemmatize_tweet(tokenized_tweet):
    """Lemmatize a tokenised tweet with the module-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces, the resulting string is run
    through ``nlp``, and the ``lemma_`` of every token in the resulting doc is
    returned as a list.

    NOTE(review): ``nlp`` tokenises the joined string itself, so the output
    presumably need not have the same length as ``tokenized_tweet`` — confirm.
    """
    joined_text = " ".join(tokenized_tweet)
    return [token.lemma_ for token in nlp(joined_text)]

In [None]:
# Lemmatize every tweet, with a progress bar.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    # pool.map returns a lazy iterator without a length, so tqdm cannot work
    # out the total by itself; passing it explicitly gives a percentage and ETA
    # instead of a bare running counter.
    data["text"] = list(tqdm(pool.map(lemmatize_tweet,data["text"]),total=len(data["text"])))

In [None]:
# Checkpoint: pickle the lemmatized "text" column to the working directory.
with open("lemmatized_tweets.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

In [3]:
# Start from an empty frame and restore the "text" column from a pickle file.
data = pd.DataFrame()

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# NOTE(review): presumably this is the lemmatized_tweets.pkl written by the
# lemmatization stage, re-uploaded as a Kaggle dataset — confirm.
with open("/kaggle/input/lemmantized-data/lemmatized_tweets.pkl","rb") as file_handle:
    data["text"] = pickle.load(file_handle)

In [4]:
# Seeded so that the preview shows the same five rows on every run.
data['text'].sample(5, random_state=42)

1481742                                       [like, dreamy]
606693                        [ugh, dream, mind, want, hard]
308846                  [hello, hello, feel, prepared, exam]
134335                                                    []
1078077    [plan, concert, philippine, summer, tour, way,...
Name: text, dtype: object

In [5]:
# Re-join every token list into one space-separated string per tweet.
converted_raw_text = [" ".join(tokens) for tokens in data["text"]]

In [6]:
# Drop the empty strings (tweets that had no tokens left).
converted_raw_text = [text for text in converted_raw_text if len(text) > 0]

In [7]:
len(converted_raw_text)

1408026

In [8]:
# Every distinct space-separated token across all non-empty tweets.
vocab = {
    token
    for cleaned_tweet in converted_raw_text
    for token in cleaned_tweet.split(" ")
}

In [12]:
len(vocab)

273488

In [9]:
# Fit a TF-IDF model on the full corpus with default settings and build the
# document-term matrix (one row per tweet, one column per learned term).
# NOTE(review): the vectorizer tokenises the strings itself, so its vocabulary
# size need not equal len(vocab) from the whitespace split above.
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(converted_raw_text)

In [10]:
vectorized_text.shape

(1408026, 273469)

In [11]:
# Corpus-wide frequency of every token, counted over the token lists.
cumulative_tfs = Counter(
    token
    for cleaned_tweet in data["text"]
    for token in cleaned_tweet
)

In [12]:
# Keep the 30000 most frequent tokens and give each a column index equal to
# its frequency rank (0 = most frequent).
top_token_counts = cumulative_tfs.most_common(30000)
most_frequent_tokens = dict(top_token_counts)
truncated_vocab = [token for token in most_frequent_tokens]

truncated_vocab2idx = {
    token: position for position, token in enumerate(truncated_vocab)
}  # indexing

In [13]:
len(truncated_vocab2idx)

30000

In [14]:
# Re-fit TF-IDF with the fixed token -> column-index mapping built above, so
# the matrix has exactly len(truncated_vocab2idx) columns.
# This rebinds `vectorizer`; the full-vocabulary model from earlier is replaced.
vectorizer = TfidfVectorizer(vocabulary=truncated_vocab2idx)
truncated_tfidf_matrix = vectorizer.fit_transform(converted_raw_text)



In [15]:
truncated_tfidf_matrix.shape

(1408026, 30000)

<h1>SVD: $A = U \Sigma V^{T}$</h1>
<h4>$U$ and $V^{T}$ are orthogonal matrices, and $\Sigma$ (sigma) is a diagonal matrix.</h4>
<h4>The columns of $U$ are the eigenvectors of $A A^{T}$ (the corresponding eigenvalues are the squared singular values). $U$ and $V$ are unitary, which means $U U^{T} = U^{T} U = I$ (the identity matrix).</h4>
<h4>The same holds for $V$: $V V^{T} = V^{T} V = I$.</h4>
<h4>$\Sigma$ is a diagonal matrix with $\sigma_1 \ge \sigma_2 \ge \sigma_3 \ge \dots \ge \sigma_n \ge 0$.</h4>
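<h2>The columns of $U$ are called the left singular vectors, the columns of $V$ are called the right singular vectors, and the diagonal entries of $\Sigma$ are called the singular values; the singular values are ordered by importance (largest first).</h2>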

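<h2>$V^{T}$ holds the word embeddings: each column of $V^{T}$ (equivalently, each row of $V$) is the embedding vector of one vocabulary word.</h2>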

In [None]:
# Left singular vectors U are the eigenvectors of A*A.T: solve det(A*A.T - lambda * I) = 0 for the eigenvalues lambda (I = identity matrix)

In [16]:
from sklearn.decomposition import TruncatedSVD

# A fixed random_state makes the decomposition reproducible across runs
# (TruncatedSVD's default solver is randomized).
svd = TruncatedSVD(n_components=64, random_state=42) # Reduce to 64 dimensions

# fit_transform fits the model and projects the documents in one pass,
# instead of fitting and then projecting the same matrix a second time.
# NOTE: the projection is the document matrix multiplied by components_.T,
# i.e. U scaled by the singular values rather than the unit-norm U itself.
U_k = svd.fit_transform(truncated_tfidf_matrix)
V_k = svd.components_        # shape (n_components, n_terms): one row per component
S = svd.singular_values_     # one singular value per component

In [17]:
# S_k = np.sqrt(svd.explained_variance_ * (vectorized_text.shape[0]-1))

In [17]:
U_k.shape, V_k.shape, S.shape

((1408026, 64), (64, 30000), (64,))

In [20]:
num_top_words = 15
terms = vectorizer.get_feature_names_out()

# Each row of V_k holds one component's weight for every term; print the
# highest-weighted terms of each component as a "topic".
for i, topic in enumerate(V_k):

    # argsort sorts ascending, so the last `num_top_words` indices are the
    # largest weights; reverse them to list the strongest term first.
    top_indices = topic.argsort()[-num_top_words:][::-1]
    top_words = [terms[index] for index in top_indices]

    print(f"Topic: {i+1}: {', '.join(top_words)}")


# Row 0 is the first document of the TF-IDF matrix. The label below says
# "First", but the earlier code read row 1, i.e. the second document.
document_topic_distribution  = U_k[0,:]
print(f" Document First topic distribution: {document_topic_distribution}")
print(len(document_topic_distribution))

Topic: 1: work, day, good, go, get, today, want, quot, like, time, love, feel, miss, home, morning
Topic: 2: work, want, get, tomorrow, ready, hour, today, home, tired, till, hard, bored, early, weekend, ugh
Topic: 3: quot, work, want, love, know, like, lol, get, watch, think, miss, new, say, song, amp
Topic: 4: good, quot, work, morning, day, luck, night, world, afternoon, mood, sound, say, beautiful, monday, news
Topic: 5: day, quot, happy, today, great, mother, school, long, beautiful, nice, enjoy, father, sunny, rainy, bad
Topic: 6: want, day, love, know, lol, like, thank, bad, home, happy, feel, come, new, get, mother
Topic: 7: love, miss, work, thank, lt, lol, twitter, guy, song, new, friend, happy, get, ya, amp
Topic: 8: miss, want, good, day, quot, work, morning, home, friend, baby, leave, school, luck, boyfriend, talk
Topic: 9: like, get, feel, lol, know, well, miss, look, thank, time, think, sick, bad, today, hope
Topic: 10: get, home, ready, time, night, sleep, new, tomorrow

In [22]:
# Array copy of the component matrix V_k (despite the name, not a DataFrame).
# NOTE(review): no later cell visible here reads `data_df` — confirm it is still needed.
data_df = np.array(V_k)

In [None]:
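# Rank-k reconstruction of the TF-IDF matrix: A_k = U_k * S_k * V_k
# S_k_diag is meant to be the diagonal matrix of the singular values (e.g. np.diag(S)); it is not defined in the cells above.
# NOTE(review): U_k from svd.transform is already scaled by the singular values, so multiplying by S_k_diag again
# would presumably apply them twice — verify before enabling this line.
# A_k = np.dot(np.dot(U_k, S_k_diag), V_k)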

In [21]:
# token -> column index, as held by the fitted vectorizer
word_to_index = vectorizer.vocabulary_  #index
# Transpose the component matrix so that row i is the vector of term i.
word_vectors = np.transpose(V_k)

In [22]:
word_vectors.shape

(30000, 64)

In [27]:
# Reverse lookup: column index -> token.
index_to_word = {index: word for word, index in word_to_index.items()}
    
    

In [None]:
# Preview only the first ten entries; displaying the whole mapping would dump
# one line per vocabulary entry into the notebook output.
dict(list(index_to_word.items())[:10])

In [37]:
 #Example: Get vector for a specific word
# Look up the embedding of one word: token -> column index -> row of word_vectors.
# Indexing word_to_index directly raises KeyError if the word is not in the vocabulary.
word = "example"
word_index = word_to_index[word]
vector = word_vectors[word_index]

In [38]:
word_index

3159

In [32]:
vector

array([ 4.33154210e-04, -3.02649039e-04,  2.37097552e-04,  2.35383958e-04,
       -1.66199202e-04,  3.46233588e-06,  6.24457738e-05, -5.93673345e-05,
        1.67724744e-05, -1.43797639e-05,  1.33193757e-04,  8.29545024e-05,
        5.91614674e-06,  5.70298681e-06, -4.38143883e-06, -1.94142186e-04,
       -6.89824626e-05,  2.08339532e-05,  3.30724928e-05,  3.96527073e-04,
        4.30015571e-05,  3.06636192e-05, -1.89400065e-07, -9.04715229e-05,
       -1.37021803e-04, -2.41186193e-04, -1.72679985e-04, -6.26326591e-05,
       -8.97852394e-05,  7.41388908e-06,  2.62658544e-04, -1.77423195e-04,
        1.05992713e-05, -5.61983015e-05, -2.14315970e-05, -2.69773376e-05,
       -2.84150043e-04, -2.82254523e-05, -3.43658711e-05,  2.69792687e-05,
        2.40058764e-04, -1.41502084e-04, -3.12568911e-05,  2.52656853e-04,
       -2.72973054e-05,  1.02369312e-05,  3.75508866e-04,  1.64255225e-04,
       -2.51260364e-04, -2.14597960e-05, -2.63469674e-05, -3.54526849e-05,
       -2.57629340e-05,  