<a href="https://colab.research.google.com/github/SVT23/Text-Mining-and-Language-/blob/main/TM_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TOPIC MODELING

Topic modeling allows us to automatically dsicover topics from a collection of documents. 

Shown below in code is basic python imports. Gets other libraries. 

In [4]:
import pandas as pd 
import numpy as np 
import nltk
from nltk.tokenize import RegexpTokenizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation 


# COVID SURVEY DATASET

In [5]:
# Read data from CSV file 
covid_survey = pd.read_csv('COVIDSurveydata.csv')
# filter data by text_long column 
covid_survey['text_long']

0       It is less an much an issue of how it affects ...
1       I am concerned that the true impact of the cur...
2       Personally, I am fairly calm about the corona ...
3       In this very moment as I am fortunate to be ab...
4       I am more worried about getting access to my n...
                              ...                        
2478    I feel sad for the loss of life and the pain t...
2479    I fear  that  the virus is more deadly than  w...
2480    I feel stressed and anxious about people ignor...
2481    It is quite worrying even though it said to ha...
2482    I feel helpless that in reality there's nothin...
Name: text_long, Length: 2483, dtype: object

# VECTORIZE THE DATA


In [6]:
# Vectorize survey using TF-IDF
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                            ngram_range = (1,2),
                            #max_df=0.95, min_df=2, 
                            #max_features=n_features, 
                            stop_words="english"
)

# Fit and Transform the documents
X = tfidf_vectorizer.fit_transform(covid_survey['text_long'])  

# get the actual words from the vectorized data
tf_feature_names = tfidf_vectorizer.get_feature_names_out()

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 2483, n_features: 82254


# PERFORM LDA

In [7]:
# Define the number of TOPICS or components
# LDA topic modeling with 5 topics 
num_components=5

# Create LDA object
ldamodel=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform model on data that has already been vectorized
lda_matrix = ldamodel.fit_transform(X)

# Get Components from the lda model
# components_[i, j] can be viewed as pseudocount that represents the number of 
# times word j was assigned to topic i. from scikit-learn
lda_components=ldamodel.components_

In [8]:
# view the topic models
n_top_words = 10 # how many words per topic

for i, topic in enumerate(lda_components):
    # numpy argsort: Returns the indices that would sort an array
    # we want the last n_top_words indices, as they have the highest counts 
    
    top_features_index = topic.argsort() [  :-n_top_words - 1 : -1 ]

    # based on indices, get the words, from the vectorizer features
    top_features = [tf_feature_names[i] for i in top_features_index]
    print('topic', i, top_features)

topic 0 ['people', 'feel', 'family', 'situation', 'virus', 'worried', 'anxious', 'going', 'time', 'think']
topic 1 ['feel', 'people', 'worried', 'family', 'situation', 'time', 'virus', 'home', 'anxious', 'work']
topic 2 ['feel', 'people', 'worried', 'family', 'time', 'situation', 'virus', 'home', 'anxious', 'government']
topic 3 ['people', 'feel', 'worried', 'situation', 'family', 'time', 'virus', 'anxious', 'don', 'going']
topic 4 ['feel', 'people', 'worried', 'family', 'situation', 'home', 'time', 'virus', 'work', 'anxious']


# **UNIGRAM**

In [9]:
from nltk.util import ngrams 
covid_survey = pd.read_csv('COVIDSurveydata.csv')
covid_survey['text_long']
n = 1
unigrams = ngrams(covid_survey, n)
for item in unigrams: 
  print(item)

('Unnamed: 0',)
('V1',)
('worry',)
('chosen_emotion',)
('anger',)
('disgust',)
('fear',)
('anxiety',)
('sadness',)
('happiness',)
('relaxation',)
('desire',)
('text_long',)
('timing_textlong_firstclick',)
('timing_textlong_lastclick',)
('timing_textlong_submit',)
('timing_textlong_nclicks',)
('text_short',)
('timing_textshort_firstclick',)
('timing_textshort_lastclick',)
('timing_textshort_submit',)
('timing_textshort_nclicks',)
('self_rating_general',)
('self_rating_short',)
('self_rating_long',)
('twitter_general_often',)
('twitter_tweet_often',)
('twitter_participate_often',)
('eng_native',)
('ntok_long',)
('nchar_long',)
('ntok_short',)
('nchar_short',)
('cld_lang_long',)
('cld_lang_short',)
('id',)
('n_punct',)
('prop_punct',)
('age',)
('Country of Birth',)
('Current Country of Residence',)
('Employment Status',)
('First Language',)
('Nationality',)
('Sex',)
('Social-Media',)
('Student Status',)




```
# This is formatted as code
```

# TRIGRAM

In [10]:
from nltk.util import ngrams 
covid_survey = pd.read_csv('COVIDSurveydata.csv')
covid_survey['text_long']
n = 3
unigrams = ngrams(covid_survey, n)
for item in unigrams: 
  print(item)

('Unnamed: 0', 'V1', 'worry')
('V1', 'worry', 'chosen_emotion')
('worry', 'chosen_emotion', 'anger')
('chosen_emotion', 'anger', 'disgust')
('anger', 'disgust', 'fear')
('disgust', 'fear', 'anxiety')
('fear', 'anxiety', 'sadness')
('anxiety', 'sadness', 'happiness')
('sadness', 'happiness', 'relaxation')
('happiness', 'relaxation', 'desire')
('relaxation', 'desire', 'text_long')
('desire', 'text_long', 'timing_textlong_firstclick')
('text_long', 'timing_textlong_firstclick', 'timing_textlong_lastclick')
('timing_textlong_firstclick', 'timing_textlong_lastclick', 'timing_textlong_submit')
('timing_textlong_lastclick', 'timing_textlong_submit', 'timing_textlong_nclicks')
('timing_textlong_submit', 'timing_textlong_nclicks', 'text_short')
('timing_textlong_nclicks', 'text_short', 'timing_textshort_firstclick')
('text_short', 'timing_textshort_firstclick', 'timing_textshort_lastclick')
('timing_textshort_firstclick', 'timing_textshort_lastclick', 'timing_textshort_submit')
('timing_textsho

# GENDER

In [11]:
# Read data from CSV file 
covid_survey = pd.read_csv('COVIDSurveydata.csv')
# filter data by Sex column 
covid_survey['Sex']

0          NaN
1          NaN
2          NaN
3       Female
4       Female
         ...  
2478       NaN
2479       NaN
2480       NaN
2481       NaN
2482       NaN
Name: Sex, Length: 2483, dtype: object