In [2]:
# Colab-only step: attach the user's Google Drive to the runtime filesystem
# so the dataset can be read from a path under /content/gdrive below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import pandas as pd
import numpy as np

# Location of the dataset on the mounted Drive.
csv_path = '/content/gdrive/MyDrive/SMA Practice/Dataset/social_media_data.csv'

# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index; passing index_col=0 would likely
# drop it — confirm against the file before changing.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,text
0,1,Just had a great meal at the new restaurant do...
1,2,I love going to the beach with my family in th...
2,3,Can't wait to see the new movie coming out thi...
3,4,"Feeling stressed out at work today, need a vac..."
4,5,I can't believe how fast time is flying by


In [5]:
import nltk
# Fetch the NLTK resources used by the preprocessing cell:
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize. Older NLTK
#     releases load 'punkt'; newer releases look up 'punkt_tab' instead, so
#     both are requested to keep the notebook runnable on either.
#   - 'stopwords': the English stopword list used for filtering.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Pull the raw post text out of the frame as a plain Python list of strings.
tweets = df['text'].tolist()
tweets

['Just had a great meal at the new restaurant downtown!',
 'I love going to the beach with my family in the summer',
 "Can't wait to see the new movie coming out this weekend",
 'Feeling stressed out at work today, need a vacation',
 "I can't believe how fast time is flying by",
 'The concert last night was amazing, the band sounded even better live',
 "I'm so excited to start my new job next week!",
 'Feeling down today, need some motivation to get through the day',
 'Spent the afternoon hiking in the mountains, the view was incredible',
 'Just finished reading a great book, highly recommend it!',
 'Having a lazy day at home watching movies and eating popcorn',
 'Feeling grateful for my family and friends today',
 'The weather is perfect for a day at the park',
 "I can't believe it's already been a year since I graduated from college",
 'Excited to start planning my next vacation!']

In [7]:
# Preprocess every tweet: lowercase -> tokenize -> drop stopwords -> stem.
stemmer = SnowballStemmer('english')

# Build the stopword set once. Evaluating stopwords.words('english') inside
# the comprehension repeats the lookup for every token, and membership tests
# against a list are linear; a set gives the same result with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

tweets_processed = []
for tweet in tweets:
    tokens = word_tokenize(tweet.lower())
    filtered_tokens = [word for word in tokens if word not in english_stopwords]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    tweets_processed.append(stemmed_tokens)

# NOTE(review): punctuation tokens ('!', ',') and contraction fragments
# ("n't", "'s") are not stopwords and therefore survive this step; consider
# filtering them if they should not influence the topic model.
tweets_processed

[['great', 'meal', 'new', 'restaur', 'downtown', '!'],
 ['love', 'go', 'beach', 'famili', 'summer'],
 ['ca', "n't", 'wait', 'see', 'new', 'movi', 'come', 'weekend'],
 ['feel', 'stress', 'work', 'today', ',', 'need', 'vacat'],
 ['ca', "n't", 'believ', 'fast', 'time', 'fli'],
 ['concert',
  'last',
  'night',
  'amaz',
  ',',
  'band',
  'sound',
  'even',
  'better',
  'live'],
 ["'m", 'excit', 'start', 'new', 'job', 'next', 'week', '!'],
 ['feel', 'today', ',', 'need', 'motiv', 'get', 'day'],
 ['spent', 'afternoon', 'hike', 'mountain', ',', 'view', 'incred'],
 ['finish', 'read', 'great', 'book', ',', 'high', 'recommend', '!'],
 ['lazi', 'day', 'home', 'watch', 'movi', 'eat', 'popcorn'],
 ['feel', 'grate', 'famili', 'friend', 'today'],
 ['weather', 'perfect', 'day', 'park'],
 ['ca', "n't", 'believ', "'s", 'alreadi', 'year', 'sinc', 'graduat', 'colleg'],
 ['excit', 'start', 'plan', 'next', 'vacat', '!']]

In [8]:
# Map every distinct stemmed token across all tweets to an integer id.
dictionary = gensim.corpora.Dictionary(documents=tweets_processed)

In [9]:
# Convert each tweet into a bag-of-words representation
corpus = []

for tweet in tweets_processed:
  bow = dictionary.doc2bow(tweet)
  corpus.append(bow)

corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(4, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)],
 [(11, 1), (14, 1), (25, 1), (26, 1), (27, 1), (28, 1)],
 [(18, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(0, 1), (4, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)],
 [(18, 1), (19, 1), (20, 1), (22, 1), (44, 1), (45, 1), (46, 1)],
 [(18, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)],
 [(0, 1), (2, 1), (18, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)],
 [(13, 1), (44, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(7, 1), (19, 1), (22, 1), (63, 1), (64, 1)],
 [(44, 1), (65, 1), (66, 1), (67, 1)],
 [(11, 1),
  (14, 1),
  (25, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1)],
 [(0, 1), (23, 1), (39, 1), (41, 1), (42, 1), (74, 1)]]

In [13]:
# Fit a 5-topic LDA model to the bag-of-words corpus.
# random_state is fixed so repeated runs yield the same topics; passes is the
# number of full sweeps over the corpus during training.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

In [16]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.0


In [18]:
# Interactive topic visualisation of the trained LDA model.
#
# pyLDAvis renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models` (3.3+), so the old import fails on the 3.4.0
# release installed above. Try the current name first and fall back to the
# legacy one so the cell works on either.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()  # render the visualisation inline in the notebook
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(
