In [None]:
# 1. Load the dataset and lowercase it.
# 2. Vectorize the dataset using the TF-IDF vectorizer.
# 3. Reduce the dimensionality of the TF-IDF matrix using truncated SVD.
# 4. Zip the words from the TF-IDF vocabulary with their weights in each SVD component.
# 5. Sort them and print the desired number of top words for each topic.

In [1]:
# Toy corpus: seven short sentences used to walk through the pipeline.
# The sentences are kept verbatim (including the spelling "technlogy"),
# because the recorded outputs further down contain those exact tokens.
data = [
    "The amount of pollution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technlogy",
    "AI robots are great example of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

In [2]:
# Importing the required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
import warnings

In [3]:
# Lowercase every sentence so that differently-cased spellings of a word match
data = list(map(str.lower, data))

In [8]:
# Vectorize the data using TFIDF Vectorizer.
# fit_transform learns the vocabulary from the sentences and builds the
# document-term matrix in one call. The fitted `tfidf` object is kept because
# its vocabulary is read back later to label the SVD components.
tfidf = TfidfVectorizer()
data_vec = tfidf.fit_transform(data)
# Show the matrix summary (the recorded run reports a 7x42 sparse matrix)
data_vec

<7x42 sparse matrix of type '<class 'numpy.float64'>'
	with 51 stored elements in Compressed Sparse Row format>

In [11]:
# Truncate the vectorized data.
# (TruncatedSVD is already imported in the imports cell; this repeat is redundant but harmless.)
from sklearn.decomposition import TruncatedSVD
# Keep 4 components, i.e. one per topic printed further down. random_state is
# fixed so that the randomized solver gives the same components on every run.
trunc_svd = TruncatedSVD(n_components=4, n_iter=100, random_state=42)

In [12]:
# Fit the SVD on the sparse TF-IDF matrix; the result is read from `components_` below
trunc_svd.fit(data_vec)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
             random_state=42, tol=0.0)

In [20]:
# Sanity check: the number of components the model was configured with
trunc_svd.n_components

4

In [15]:
# Inspect the component at index 3: one weight per vocabulary term (42 values in the recorded run)
trunc_svd.components_[3]

array([ 0.20230022, -0.20857394, -0.07085104, -0.03746641,  0.20230022,
       -0.07085104, -0.03746641, -0.14801398,  0.09516616, -0.14170208,
        0.20230022, -0.03746641,  0.21834424,  0.09516616,  0.36097342,
       -0.03746641, -0.20857394, -0.07085104,  0.21834424,  0.12243203,
        0.03026229, -0.03746641,  0.09516616,  0.21834424, -0.05472232,
       -0.08991278,  0.20230022,  0.09516616,  0.20230022,  0.09516616,
       -0.20857394, -0.03746641,  0.21834424,  0.20230022, -0.17678838,
        0.04789575,  0.20230022, -0.20857394, -0.03746641,  0.03026229,
       -0.03746641, -0.20857394])

In [17]:
# Get the vocabulary in the column order of the TF-IDF matrix, so that
# position k here lines up with weight k of every SVD component.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# `.tolist()` turns the returned NumPy array into a list of plain Python
# strings, matching what the old method returned.
if hasattr(tfidf, "get_feature_names_out"):
    word_list = tfidf.get_feature_names_out().tolist()
else:
    word_list = tfidf.get_feature_names()

In [18]:
# Vocabulary size (42 in the recorded run, matching the column count of the TF-IDF matrix)
len(word_list)

42

In [22]:
def print_top_terms(components, vocabulary, top_n=5):
    """Print the highest-weighted vocabulary terms of each component.

    Args:
        components: iterable of weight sequences, one per topic; each sequence
            is aligned position-by-position with ``vocabulary``.
        vocabulary: sequence of terms in the same order as the weights.
        top_n: number of terms to print per topic.

    Prints a ``Topic <index> ;`` header followed by ``top_n``
    ``(term, weight)`` tuples, largest weight first. Returns ``None``.
    """
    for topic_idx, weights in enumerate(components):
        # Pair every term with its weight, then rank by the signed weight in
        # descending order (terms with large negative weights are not shown).
        ranked = sorted(zip(vocabulary, weights), key=lambda pair: pair[1], reverse=True)
        print("\nTopic", topic_idx, ";")
        for pair in ranked[:top_n]:
            print(pair)


# Top 5 terms for each of the topics learned on the toy sentences
print_top_terms(trunc_svd.components_, word_list, top_n=5)


Topic 0 ;
('great', 0.40984162355340126)
('the', 0.3787231472218794)
('concert', 0.35741668117165964)
('of', 0.2773738067325903)
('just', 0.2598047527022691)

Topic 1 ;
('to', 0.32184144941377196)
('pollution', 0.2641076135719797)
('day', 0.24902077191893007)
('is', 0.2290114925683931)
('cook', 0.19406202788196772)

Topic 2 ;
('is', 0.3442407374203242)
('google', 0.2936874640351261)
('introducing', 0.2936874640351261)
('new', 0.2936874640351261)
('technlogy', 0.2936874640351261)

Topic 3 ;
('great', 0.3609734156929444)
('google', 0.2183442429592279)
('introducing', 0.2183442429592279)
('new', 0.2183442429592279)
('technlogy', 0.2183442429592279)


In [None]:
# Topic modelling for the consumer complaints dataset

In [23]:
import os

import pandas as pd
import numpy as np

# Location of the complaints CSV. The original absolute path is kept as the
# default so existing runs are unaffected; set the CONSUMER_COMPLAINTS_CSV
# environment variable to run the notebook on another machine.
CSV_PATH = os.environ.get(
    "CONSUMER_COMPLAINTS_CSV",
    "C:/Users/Shlagha Rastogi/Downloads/SWTA/Consumer_Complaints1.csv",
)

# NOTE(review): the truncated warning in the recorded output of this cell looks
# like pandas' mixed-dtype DtypeWarning -- confirm. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(CSV_PATH, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Quick look at the loaded frame: (rows, columns), then the first five records
print(df.shape)
df.head()

(242038, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,07/17/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,,ALLY FINANCIAL INC.,TX,75035,,,Web,07/17/2019,In progress,Yes,,3309495.0
1,07/17/2019,Credit card or prepaid card,General-purpose prepaid card,Trouble using the card,Problem using the card to withdraw money from ...,,,SQUARE INC,KS,674XX,,,Web,07/17/2019,In progress,Yes,,3310031.0
2,07/17/2019,Debt collection,Other debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Diversified Consultants, Inc.",FL,,,,Web,07/17/2019,In progress,Yes,,3309687.0
3,07/17/2019,Mortgage,VA mortgage,Trouble during payment process,,,,"FLAGSTAR BANK, FSB",VA,22554,Servicemember,,Web,07/17/2019,In progress,Yes,,3308925.0
4,07/17/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,,Company believes it acted appropriately as aut...,BYL Collection Services,TN,370XX,Servicemember,,Web,07/17/2019,Closed with explanation,Yes,,3308914.0


In [27]:
# Select only the free-text narrative column; this is the text that gets topic-modelled
consumer_complaints = df["Consumer complaint narrative"]
consumer_complaints.shape

(242038,)

In [28]:
# Drop complaints that have no narrative text.
# Reassign rather than using `dropna(inplace=True)`: the Series was selected
# from `df`, and mutating it in place makes the shape displayed by the
# previous cell stale. Reassignment gives the same result and is safe to re-run.
consumer_complaints = consumer_complaints.dropna()
consumer_complaints.shape

(77608,)

In [30]:
# Keep only the first 100 narratives as a small working sample
consumer_complaints_small = consumer_complaints.head(100)
consumer_complaints_small.shape

(100,)

In [35]:
# Convert the sample to a plain Python list; each entry is lowercased and tokenized later
corpus = consumer_complaints_small.tolist()
len(corpus)

100

In [36]:
# Importing libraries for tokenizing, lemmatizing and removing stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

from nltk.corpus import stopwords
from string import punctuation
words_to_be_removed = list(stopwords.words("english"))+list(punctuation)

[nltk_data] Downloading package words to C:\Users\Shlagha
[nltk_data]     Rastogi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [37]:
# Clean every complaint:
#   lowercase -> tokenize -> drop tokens of length <= 2 and stopwords/punctuation -> lemmatize
# The result is one space-joined string per complaint, in the original order.

# Hoisted out of the loop: set membership avoids scanning the list for every token.
removal_set = set(words_to_be_removed)

final_corpus = []
for complaint in corpus:
    tokens = word_tokenize(complaint.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) > 2 and token not in removal_set
    ]
    final_corpus.append(" ".join(kept))

In [38]:
# Wrap the cleaned complaints in a single-column DataFrame named "Complaints"
new_df = pd.DataFrame({"Complaints": final_corpus})
new_df.head()

Unnamed: 0,Complaints
0,duplicate contacted xxxx xxxx experian fraudul...
1,16000.00 debt
2,call dont answer call call day long havent sig...
3,provided account information showed debt paid ...
4,charge credit report collection agency trident...


In [39]:
# (TfidfVectorizer is already imported in the imports cell; this repeat is redundant but harmless.)
from sklearn.feature_extraction.text import TfidfVectorizer
# A fresh vectorizer for the complaints. This rebinds `tfidf`, replacing the
# vectorizer that was fitted on the toy sentences.
tfidf = TfidfVectorizer()

# Document-term matrix for the cleaned complaints (100x1390 sparse in the recorded run)
vec = tfidf.fit_transform(new_df["Complaints"])
vec

<100x1390 sparse matrix of type '<class 'numpy.float64'>'
	with 4473 stored elements in Compressed Sparse Row format>

In [42]:
# Convert the sparse TF-IDF matrix into a dense NumPy array.
# NOTE(review): the toy example earlier fits TruncatedSVD on the sparse matrix
# directly, so this conversion may be unnecessary -- confirm before removing.
vec_arr = vec.toarray()
vec_arr.shape

(100, 1390)

In [43]:
# (TruncatedSVD is already imported in the imports cell; this repeat is redundant but harmless.)
from sklearn.decomposition import TruncatedSVD
# Same settings as the toy example: 4 components, fixed random_state.
# This rebinds `trunc_svd`, replacing the model fitted on the toy sentences.
trunc_svd = TruncatedSVD(n_components=4, n_iter=100, random_state=42)

# Fit on the dense complaint matrix
trunc_svd.fit(vec_arr)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
             random_state=42, tol=0.0)

In [48]:
# Vocabulary of the complaints vectorizer, in the column order of the TF-IDF matrix.
# This rebinds `words`, which previously held the NLTK word set.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# `.tolist()` yields plain Python strings, as the old method did.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out().tolist()
else:
    words = tfidf.get_feature_names()

In [49]:
# For each SVD component, show the ten terms with the largest weights
for topic_idx, weights in enumerate(trunc_svd.components_):
    # Pair each vocabulary term with its weight in this component
    ranked = list(zip(words, weights))
    # Highest weight first; the sort is stable, so ties keep vocabulary order
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print("\nTopic", topic_idx, ";")
    for pair in ranked[:10]:
        print(pair)


Topic 0 ;
('xxxx', 0.8436184256419903)
('xx', 0.23667866168731486)
('account', 0.1374704672553495)
('credit', 0.12412703971101438)
('debt', 0.10885507315944638)
('00', 0.10202088314645022)
('report', 0.09819075410314837)
('consumer', 0.09008631710852537)
('information', 0.07765661342225733)
('reporting', 0.07360601912791781)

Topic 1 ;
('consumer', 0.5994588949412877)
('theft', 0.2797277779852393)
('information', 0.27282039505150774)
('identity', 0.26608444024417127)
('block', 0.23126327219099904)
('agency', 0.2259668215753394)
('section', 0.21633369069795225)
('reporting', 0.19199044854001146)
('victim', 0.17398405223125307)
('shall', 0.14783833381353909)

Topic 2 ;
('victim', 0.572615054778553)
('theft', 0.45225731101170724)
('identity', 0.4074066459387022)
('xxxx', 0.16933069042150484)
('item', 0.04626300591741292)
('apartment', 0.03035759862060084)
('negative', 0.029131465998954557)
('fraudulent', 0.02344512089583706)
('rental', 0.02230571306093733)
('department', 0.02173658686532