In [2]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the review corpus; malformed CSV rows are skipped instead of raising.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported spelling of the same behaviour.
# NOTE(review): the displayed frame carries `Unnamed: 0*` columns, which
# usually means the CSV was written with its index — consider `index_col=0`
# (or saving with `index=False`); confirm against how the file is produced.
documents = pd.read_csv('data_topic_model.csv', on_bad_lines='skip')
documents.head()



  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,Review
0,0,Bosses are used to facilitate the registration...
1,1,Ribs are used in a design to increase the bend...
2,2,Mold drafts facilitate part removal from the m...
3,3,Inserts used in plastic parts provide a place ...
4,4,Ultrasonic insertion is when an insert is “vib...


In [10]:
# Build TF-IDF features from the review text. English stop words are dropped;
# min_df=1 keeps every remaining term that occurs in at least one document,
# i.e. no rare-term pruning is applied.
vect = TfidfVectorizer(stop_words='english', min_df=1)
# Learn the vocabulary/IDF weights and vectorise the reviews in one call
X = vect.fit_transform(documents['Review'])

In [8]:
# Displays a 2-tuple: the full `documents` frame and the builtin `min` function.
# NOTE(review): the trailing `min` looks like a stray typo — confirm and drop it
# (a bare `documents` would render the frame with the rich table display).
(documents, min)

(    Unnamed: 0                                             Review
 0            0  Bosses are used to facilitate the registration...
 1            1  Ribs are used in a design to increase the bend...
 2            2  Mold drafts facilitate part removal from the m...
 3            3  Inserts used in plastic parts provide a place ...
 4            4  Ultrasonic insertion is when an insert is “vib...
 5            5  This method uses a heated tool, like a solderi...
 6            6  This process is used to hollow out thick secti...
 7            7  The overmolding process is when a flexible mat...
 8            8  The most widely used overmolding process is in...
 9            9  This is a multi-material overmolding process t...
 10          10  Because the thickness of the material will be ...
 11          11  Parts with walls of uniform thickness allow th...,
 <function min>)

In [11]:
# Factorise the TF-IDF matrix with NMF; each of the 10 components is read as
# one topic. random_state is fixed so reruns are reproducible.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = NMF(random_state=5, n_components=10).fit(X)
# Project the same TF-IDF rows onto the learned components: nmf_features
nmf_features = model.transform(X)



In [12]:
# Dimensions of the TF-IDF matrix: (documents, vocabulary terms)
X.shape

(12, 115)

In [13]:
# Dimensions of the transformed data: (documents, NMF components)
nmf_features.shape

(12, 10)

In [14]:
# Dimensions of the learned factor matrix: (NMF components, vocabulary terms)
model.components_.shape


(10, 115)

In [15]:
# Create a DataFrame: components_df
# One row per NMF component (topic), one column per vocabulary term of `vect`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs that
# do not provide it yet.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
components_df = pd.DataFrame(model.components_, columns=feature_names)
components_df



Unnamed: 0,acceptable,accepting,adding,allow,angle,attaching,avoid,barrels,bending,bond,...,ultrasonic,uniform,use,used,uses,using,varying,vibrated,walls,widely
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21346,...,0.0,0.0,0.0,0.150605,0.0,0.0,0.0,0.0,0.0,0.265345
1,0.0,0.207319,0.0,0.0,0.0,0.207319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.280607,0.0,0.0,0.0,0.0,0.0,0.000524
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.010227,0.329317,0.0,0.0,0.0,0.0,0.012251
3,0.0,0.003538,0.0,0.0,0.40673,0.003538,0.0,0.0,0.0,0.0,...,2.020691e-09,0.0,0.203365,0.008963,0.0,6.735638e-10,0.0,6.735638e-10,0.0,0.017274
4,0.0,0.0,0.259361,0.0,0.0,0.0,0.0,0.0,0.518722,0.0,...,0.0,0.0,0.0,0.16183,0.0,0.0,0.0,0.0,0.0,0.002695
5,0.361093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.226777,0.0,0.0,0.0,0.0,0.0,0.005849
6,0.0,0.0,0.0,0.0,0.0,0.0,0.249124,4e-06,0.0,0.005156,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.6e-05,...,0.8706366,0.0,0.0,0.001014,0.0,0.2902122,0.0,0.2902122,0.0,0.0
8,0.0,0.0,0.0,0.32501,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.32501,0.0,0.003859,0.0,0.0,0.32501,0.0,0.32501,0.003192
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35718,0.0,0.0,...,0.0,0.0,0.0,0.00614,0.0,0.0,0.0,0.0,0.0,0.006654


In [16]:
# Print, for every component (row of components_df), the ten terms with the
# largest weights. Topics are numbered from 1 in the printed header.
for topic, (_row_label, weights) in enumerate(components_df.iterrows()):
    print(f'For topic {topic+1} the words with the highest value are:')
    print(weights.nlargest(10))
    print('\n')

For topic 1 the words with the highest value are:
flexible       0.596860
substrate      0.596860
material       0.592745
molded         0.412844
overmolding    0.355998
process        0.310284
directly       0.265345
placed         0.265345
pre            0.265345
shot           0.265345
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
fasteners    0.391095
inserts      0.391095
screws       0.391095
parts        0.339184
used         0.280607
provide      0.246739
accepting    0.207319
attaching    0.207319
bosses       0.207319
mating       0.207319
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
insert       0.509547
heat         0.329317
heated       0.329317
iron         0.329317
like         0.329317
melts        0.329317
method       0.329317
presses      0.329317
soldering    0.329317
tool         0.329317
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
mold       0.632584
angle      0.406730
dr