In [1]:
import pandas as pd
import io
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from google.colab import files

## Data loading

In [2]:
# Mount Google Drive at /content/drive so that the data files can be read directly from it.
# This cell relies on the google.colab package, so it is meant to be run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the Quora questions CSV straight from the mounted Google Drive.
# Adjust the path below if the file is not stored under EPITA_NLP/Course1/ on your Drive.
quora = pd.read_csv('/content/drive/MyDrive/EPITA_NLP/Course1/quora_questions.csv')
print(quora.head(30))

# Keep only the first 10000 questions in order to reduce the computation time.
texts = quora["Question"][:10000]

                                             Question
0   What is the step by step guide to invest in sh...
1   What is the story of Kohinoor (Koh-i-Noor) Dia...
2   How can I increase the speed of my internet co...
3   Why am I mentally very lonely? How can I solve...
4   Which one dissolve in water quikly sugar, salt...
5   Astrology: I am a Capricorn Sun Cap moon and c...
6                                 Should I buy tiago?
7                      How can I be a good geologist?
8                     When do you use シ instead of し?
9   Motorola (company): Can I hack my Charter Moto...
10  Method to find separation of slits using fresn...
11        How do I read and find my YouTube comments?
12               What can make Physics easy to learn?
13        What was your first sexual experience like?
14  What are the laws to change your status from a...
15  What would a Trump presidency mean for current...
16                       What does manipulation mean?
17  Why do girls want to be 

## Non-negative Matrix Factorization

Pre-processing

In [4]:
# Turn the questions into a TF-IDF document-term matrix (one row per question).
tfidf = TfidfVectorizer(
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that appear in fewer than 2 questions
    max_df=0.95,           # ignore terms that appear in more than 95% of the questions
)
dtm = tfidf.fit_transform(texts)

Use of the NMF algorithm

In [5]:
# Factorize the TF-IDF matrix into 7 topics; the fixed seed keeps the run repeatable.
NMF_ = NMF(random_state=42, n_components=7)
NMF_.fit(dtm)

In [6]:
# Topic-term weight matrix learned by NMF: one row per topic, one column per vocabulary term.
NMF_.components_

array([[9.79166867e-03, 0.00000000e+00, 4.00033022e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.11315571e-03, 6.58958491e-04, 7.35163799e-03, ...,
        0.00000000e+00, 3.38024578e-03, 1.36348676e-02],
       [3.53157439e-02, 0.00000000e+00, 2.41039258e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.13203888e-03, ...,
        1.40125305e-06, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.79008576e-04, 1.55207805e-02, ...,
        6.00541966e-05, 0.00000000e+00, 6.52477952e-03],
       [7.81899941e-03, 2.78094199e-04, 1.35370752e-02, ...,
        0.00000000e+00, 0.00000000e+00, 6.92509172e-04]])

In [7]:
# (number of topics, vocabulary size) -- the first value matches n_components=7.
NMF_.components_.shape

(7, 5168)

Let's have a look at the most representative words of each topic


In [8]:
# Print the 15 highest-weighted vocabulary terms of every NMF topic.
# The vocabulary array is fetched once, before the loop, instead of being
# re-requested from the vectorizer for every single printed word.
tfidf_feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(NMF_.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 positions are the largest weights
    # (printed from least to most important).
    top_term_ids = topic.argsort()[-15:]
    print([tfidf_feature_names[i] for i in top_term_ids])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['start', 'movie', 'weight', 'english', 'learning', 'books', 'book', 'programming', '2016', 'language', 'movies', 'india', 'learn', 'way', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['sex', 'time', 'says', 'english', 'use', 'compare', 'cost', 'exist', 'love', 'india', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['free', 'day', 'did', 'black', 'friends', 'notes', '500', 'india', '1000', 'ways', 'way', 'earn', 'online', 'money', 'make']


THE TOP 15 WORDS FOR TOPIC #3
['marked', 'people', 'post', 'earn', 'needing', 'improvement', 'asked', 'google', 'delete', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #4
['knowledge', 'java', 'machine', 'systems', 'transgender', 'bank', 'computer', 'information', 'main', 'engineering', 'science', 'data', 'job', 'love', 'difference']


THE TOP 15 WORDS FOR TOPIC #5
['person', 'things', 'want', 'believe', 'world', 'donald', 'stop', 'don', 'did', 'trump', '

Explicitly associate each text with its most probable topic

In [9]:
# Project every question onto the NMF topics: one row per question, one weight per topic.
topic_results_NMF = NMF_.transform(dtm)
topic_results_NMF

array([[6.34194031e-03, 4.22510779e-03, 5.83846150e-03, ...,
        3.54516835e-04, 4.46587176e-03, 3.34470495e-03],
       [1.53036369e-03, 1.88241982e-03, 1.27676323e-03, ...,
        0.00000000e+00, 7.96215954e-04, 0.00000000e+00],
       [2.21955399e-03, 1.14461995e-03, 4.30790975e-03, ...,
        1.76867685e-04, 1.82502095e-03, 1.82282079e-03],
       ...,
       [5.07371035e-04, 7.80905625e-04, 1.37769903e-03, ...,
        1.72430017e-04, 1.95258480e-03, 1.56414155e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.87498092e-01],
       [1.90292697e-03, 2.31604354e-04, 0.00000000e+00, ...,
        8.94592648e-05, 3.97311074e-04, 4.09598154e-04]])

In [10]:
# For each question (row), keep the index of the topic with the largest weight.
best_topic_results_NMF = topic_results_NMF.argmax(axis=1)
best_topic_results_NMF

array([0, 1, 3, ..., 3, 6, 0])

In [11]:
# Pair every question with its dominant NMF topic.
# The column is named "topic_NMF" because the values come from the NMF model
# (it used to be mislabeled "topic_LDA").
df_topic_NMF = pd.DataFrame({"text": texts, "topic_NMF": best_topic_results_NMF})
df_topic_NMF

Unnamed: 0,text,topic_LDA
0,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,1
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,1
4,"Which one dissolve in water quikly sugar, salt...",1
...,...,...
9995,How would you order these four cities (Bangalo...,5
9996,Stphen william hawking?,0
9997,Mathematical Puzzles: What is () + () + () = 3...,3
9998,Is IMS noida good for BCA?,6


## Latent Dirichlet Allocation (LDA)

Pre-processing

In [12]:
# LDA works on raw term counts, so build a count-based document-term matrix.
cv = CountVectorizer(
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that appear in fewer than 2 questions
    max_df=0.95,           # ignore terms that appear in more than 95% of the questions
)
# NOTE: this rebinds `dtm`, replacing the TF-IDF matrix built for NMF earlier in the notebook.
dtm = cv.fit_transform(texts)

Use of the LDA algorithm

In [13]:
# Fit a 7-topic LDA model on the count matrix; the fixed seed keeps the run repeatable.
LDA = LatentDirichletAllocation(random_state=42, n_components=7)
LDA.fit(dtm)

In [14]:
# Topic-term weight matrix learned by LDA: one row per topic, one column per vocabulary term.
LDA.components_

array([[ 2.76451366,  0.14311993, 10.19314347, ...,  0.1428574 ,
         0.14285746,  0.1434078 ],
       [12.44976244,  0.14294005, 11.45899252, ...,  2.14285544,
         0.14285749,  0.14285757],
       [ 1.3313155 ,  0.14285726,  5.26836191, ...,  0.14285747,
         0.14285754,  0.14293166],
       ...,
       [ 1.18617085,  0.14285725,  0.14297319, ...,  0.14285743,
         0.14285749,  6.1431953 ],
       [ 0.14296719,  2.14251102,  0.15440711, ...,  0.14285743,
         1.14306868,  1.14172508],
       [ 2.84294391,  0.14285725,  0.44971014, ...,  0.14285744,
         0.14315076,  0.14285758]])

In [15]:
# (number of topics, vocabulary size) -- the first value matches n_components=7.
LDA.components_.shape

(7, 5168)

Let's have a look at the most representative words of each topic


In [16]:
# Print the 15 highest-weighted vocabulary terms of every LDA topic.
# The vocabulary array is fetched once, before the loop, instead of being
# re-requested from the vectorizer for every single printed word.
cv_feature_names = cv.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 positions are the largest weights
    # (printed from least to most important).
    top_term_ids = topic.argsort()[-15:]
    print([cv_feature_names[i] for i in top_term_ids])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['meaning', 'women', 'day', 'black', 'know', 'happen', 'use', 'good', 'people', 'feel', 'world', 'work', 'mean', 'like', 'does']


THE TOP 15 WORDS FOR TOPIC #1
['makes', 'ask', 'google', 'way', 'increase', 'used', 'movies', 'lose', 'question', 'questions', 'difference', 'weight', 'people', 'best', 'quora']


THE TOP 15 WORDS FOR TOPIC #2
['like', 'rid', 'different', 'body', 'human', 'help', 'instagram', 'time', 'travel', 'possible', 'iphone', 'way', 'buy', 'best', 'life']


THE TOP 15 WORDS FOR TOPIC #3
['movie', 'examples', 'exam', 'years', '2016', 'time', 'good', 'programming', 'language', 'old', 'year', 'start', 'india', 'learn', 'best']


THE TOP 15 WORDS FOR TOPIC #4
['software', 'money', 'pakistan', 'going', 'people', 'rs', 'english', 'improve', 'know', '1000', '500', 'notes', 'indian', 'think', 'india']


THE TOP 15 WORDS FOR TOPIC #5
['facebook', 'university', 'learning', 'password', 'hillary', 'clinton', 'did', 'difference', 'job', 'donald', 'goo

Explicitly associate each text with its most probable topic

In [17]:
# Project every question onto the LDA topics: one row per question, one value per topic.
topic_results_LDA = LDA.transform(dtm)
topic_results_LDA

array([[0.01786184, 0.01785901, 0.01798001, ..., 0.01787742, 0.01786066,
        0.01790007],
       [0.0476191 , 0.71415668, 0.04761911, ..., 0.0476191 , 0.04765793,
        0.04770898],
       [0.02041649, 0.20954881, 0.0204987 , ..., 0.02042326, 0.02041183,
        0.02040903],
       ...,
       [0.14705452, 0.01787374, 0.01789926, ..., 0.01786041, 0.01786808,
        0.01786091],
       [0.04775166, 0.04767432, 0.04765453, ..., 0.04763673, 0.0478086 ,
        0.04770726],
       [0.01787732, 0.01798615, 0.89241139, ..., 0.01785719, 0.01795108,
        0.01788883]])

In [18]:
# For each question (row), keep the index of the topic with the largest value.
best_topic_results_LDA = topic_results_LDA.argmax(axis=1)
best_topic_results_LDA

array([3, 1, 3, ..., 3, 3, 2])

In [19]:
# Pair every question with its dominant LDA topic: start from the questions as a
# one-column frame, then attach the topic indices as a second column.
df_topic_LDA = texts.to_frame(name="text").assign(topic_LDA=best_topic_results_LDA)
df_topic_LDA

Unnamed: 0,text,topic_LDA
0,What is the step by step guide to invest in sh...,3
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,1
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,0
4,"Which one dissolve in water quikly sugar, salt...",6
...,...,...
9995,How would you order these four cities (Bangalo...,3
9996,Stphen william hawking?,0
9997,Mathematical Puzzles: What is () + () + () = 3...,3
9998,Is IMS noida good for BCA?,3


In [20]:
# Side-by-side comparison of the LDA and NMF topic assigned to every question.
# .assign() returns a NEW frame; a plain `df_topic_LDA_NMF = df_topic_LDA` would only
# alias the LDA frame, so adding the column would silently modify df_topic_LDA as well.
df_topic_LDA_NMF = df_topic_LDA.assign(topic_NMF=best_topic_results_NMF)
df_topic_LDA_NMF

Unnamed: 0,text,topic_LDA,topic_NMF
0,What is the step by step guide to invest in sh...,3,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,1,1
2,How can I increase the speed of my internet co...,3,3
3,Why am I mentally very lonely? How can I solve...,0,1
4,"Which one dissolve in water quikly sugar, salt...",6,1
...,...,...,...
9995,How would you order these four cities (Bangalo...,3,5
9996,Stphen william hawking?,0,0
9997,Mathematical Puzzles: What is () + () + () = 3...,3,3
9998,Is IMS noida good for BCA?,3,6


Reference: https://www.udemy.com/course/nlp-natural-language-processing-with-python