## Topic Modelling after Sentiment Analysis
### Using TF-IDF & LDA (Latent Dirichlet Allocation) Algorithm

In [1]:
#pip install nltk

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
raw_dataset = pd.read_csv('mabel.txt', header = None, on_bad_lines='warn', encoding = 'utf8')

Skipping line 44: expected 2 fields, saw 3

  raw_dataset = pd.read_csv('mabel.txt', header = None, on_bad_lines='warn', encoding = 'utf8')


In [4]:
raw_dataset

Unnamed: 0,0,1
0,05/12/19,1:42 pm - Messages to this chat and calls are...
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K
4,05/12/19,1:42 pm - Mabel Infoziant: Ok
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?
7,05/12/19,1:43 pm - AR❤: Yes Mam
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...
9,05/12/19,1:43 pm - AR❤: Yeah mam


In [5]:
df = raw_dataset

In [6]:
df.columns = ['Date','Chat']

In [7]:
df

Unnamed: 0,Date,Chat
0,05/12/19,1:42 pm - Messages to this chat and calls are...
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K
4,05/12/19,1:42 pm - Mabel Infoziant: Ok
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?
7,05/12/19,1:43 pm - AR❤: Yes Mam
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...
9,05/12/19,1:43 pm - AR❤: Yeah mam


In [8]:
df = df.drop(0)

In [9]:
df

Unnamed: 0,Date,Chat
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K
4,05/12/19,1:42 pm - Mabel Infoziant: Ok
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?
7,05/12/19,1:43 pm - AR❤: Yes Mam
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...
9,05/12/19,1:43 pm - AR❤: Yeah mam
10,05/12/19,1:43 pm - Mabel Infoziant: Give me the list t...


### Splitting Message --> Time, Name & Message

In [10]:
Message = df['Chat'].str.split('-',n=1,expand=True)

In [11]:
Message

Unnamed: 0,0,1
1,1:42 pm,Mabel Infoziant: Hi this is Mabel we just spoke
2,1:42 pm,Mabel Infoziant: What’s your full name
3,1:42 pm,AR❤: Ramisha Rani K
4,1:42 pm,Mabel Infoziant: Ok
5,1:42 pm,Mabel Infoziant: ramisharanik@gmail.com
6,1:43 pm,Mabel Infoziant: Your email Id?
7,1:43 pm,AR❤: Yes Mam
8,1:43 pm,Mabel Infoziant: I will send 2 abstracts for ...
9,1:43 pm,AR❤: Yeah mam
10,1:43 pm,Mabel Infoziant: Give me the list that u have...


In [12]:
Message.columns = ['Time', 'Chat']
Message

Unnamed: 0,Time,Chat
1,1:42 pm,Mabel Infoziant: Hi this is Mabel we just spoke
2,1:42 pm,Mabel Infoziant: What’s your full name
3,1:42 pm,AR❤: Ramisha Rani K
4,1:42 pm,Mabel Infoziant: Ok
5,1:42 pm,Mabel Infoziant: ramisharanik@gmail.com
6,1:43 pm,Mabel Infoziant: Your email Id?
7,1:43 pm,AR❤: Yes Mam
8,1:43 pm,Mabel Infoziant: I will send 2 abstracts for ...
9,1:43 pm,AR❤: Yeah mam
10,1:43 pm,Mabel Infoziant: Give me the list that u have...


In [13]:
df

Unnamed: 0,Date,Chat
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K
4,05/12/19,1:42 pm - Mabel Infoziant: Ok
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?
7,05/12/19,1:43 pm - AR❤: Yes Mam
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...
9,05/12/19,1:43 pm - AR❤: Yeah mam
10,05/12/19,1:43 pm - Mabel Infoziant: Give me the list t...


In [14]:
df['Time'] = Message['Time']

In [15]:
df

Unnamed: 0,Date,Chat,Time
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...,1:42 pm
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name,1:42 pm
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K,1:42 pm
4,05/12/19,1:42 pm - Mabel Infoziant: Ok,1:42 pm
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...,1:42 pm
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?,1:43 pm
7,05/12/19,1:43 pm - AR❤: Yes Mam,1:43 pm
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...,1:43 pm
9,05/12/19,1:43 pm - AR❤: Yeah mam,1:43 pm
10,05/12/19,1:43 pm - Mabel Infoziant: Give me the list t...,1:43 pm


In [16]:
Message1 = Message['Chat'].str.split(':', n=1, expand=True)

In [17]:
Message1

Unnamed: 0,0,1
1,Mabel Infoziant,Hi this is Mabel we just spoke
2,Mabel Infoziant,What’s your full name
3,AR❤,Ramisha Rani K
4,Mabel Infoziant,Ok
5,Mabel Infoziant,ramisharanik@gmail.com
6,Mabel Infoziant,Your email Id?
7,AR❤,Yes Mam
8,Mabel Infoziant,I will send 2 abstracts for u to start working
9,AR❤,Yeah mam
10,Mabel Infoziant,Give me the list that u have too


In [18]:
Message1.columns = ['Name','Chat']
Message1

Unnamed: 0,Name,Chat
1,Mabel Infoziant,Hi this is Mabel we just spoke
2,Mabel Infoziant,What’s your full name
3,AR❤,Ramisha Rani K
4,Mabel Infoziant,Ok
5,Mabel Infoziant,ramisharanik@gmail.com
6,Mabel Infoziant,Your email Id?
7,AR❤,Yes Mam
8,Mabel Infoziant,I will send 2 abstracts for u to start working
9,AR❤,Yeah mam
10,Mabel Infoziant,Give me the list that u have too


In [19]:
df['Name'] = Message1['Name']

In [20]:
df['Message'] = Message1['Chat']

In [21]:
df

Unnamed: 0,Date,Chat,Time,Name,Message
1,05/12/19,1:42 pm - Mabel Infoziant: Hi this is Mabel w...,1:42 pm,Mabel Infoziant,Hi this is Mabel we just spoke
2,05/12/19,1:42 pm - Mabel Infoziant: What’s your full name,1:42 pm,Mabel Infoziant,What’s your full name
3,05/12/19,1:42 pm - AR❤: Ramisha Rani K,1:42 pm,AR❤,Ramisha Rani K
4,05/12/19,1:42 pm - Mabel Infoziant: Ok,1:42 pm,Mabel Infoziant,Ok
5,05/12/19,1:42 pm - Mabel Infoziant: ramisharanik@gmail...,1:42 pm,Mabel Infoziant,ramisharanik@gmail.com
6,05/12/19,1:43 pm - Mabel Infoziant: Your email Id?,1:43 pm,Mabel Infoziant,Your email Id?
7,05/12/19,1:43 pm - AR❤: Yes Mam,1:43 pm,AR❤,Yes Mam
8,05/12/19,1:43 pm - Mabel Infoziant: I will send 2 abst...,1:43 pm,Mabel Infoziant,I will send 2 abstracts for u to start working
9,05/12/19,1:43 pm - AR❤: Yeah mam,1:43 pm,AR❤,Yeah mam
10,05/12/19,1:43 pm - Mabel Infoziant: Give me the list t...,1:43 pm,Mabel Infoziant,Give me the list that u have too


In [22]:
dataset = df.drop('Chat',axis=1)

In [23]:
dataset

Unnamed: 0,Date,Time,Name,Message
1,05/12/19,1:42 pm,Mabel Infoziant,Hi this is Mabel we just spoke
2,05/12/19,1:42 pm,Mabel Infoziant,What’s your full name
3,05/12/19,1:42 pm,AR❤,Ramisha Rani K
4,05/12/19,1:42 pm,Mabel Infoziant,Ok
5,05/12/19,1:42 pm,Mabel Infoziant,ramisharanik@gmail.com
6,05/12/19,1:43 pm,Mabel Infoziant,Your email Id?
7,05/12/19,1:43 pm,AR❤,Yes Mam
8,05/12/19,1:43 pm,Mabel Infoziant,I will send 2 abstracts for u to start working
9,05/12/19,1:43 pm,AR❤,Yeah mam
10,05/12/19,1:43 pm,Mabel Infoziant,Give me the list that u have too


In [24]:
data = dataset.to_csv('Prepro_whatsapp_data.csv',index=False)

In [25]:
data = pd.read_csv('Prepro_whatsapp_data.csv',index_col=None)
data 

Unnamed: 0,Date,Time,Name,Message
0,05/12/19,1:42 pm,Mabel Infoziant,Hi this is Mabel we just spoke
1,05/12/19,1:42 pm,Mabel Infoziant,What’s your full name
2,05/12/19,1:42 pm,AR❤,Ramisha Rani K
3,05/12/19,1:42 pm,Mabel Infoziant,Ok
4,05/12/19,1:42 pm,Mabel Infoziant,ramisharanik@gmail.com
5,05/12/19,1:43 pm,Mabel Infoziant,Your email Id?
6,05/12/19,1:43 pm,AR❤,Yes Mam
7,05/12/19,1:43 pm,Mabel Infoziant,I will send 2 abstracts for u to start working
8,05/12/19,1:43 pm,AR❤,Yeah mam
9,05/12/19,1:43 pm,Mabel Infoziant,Give me the list that u have too


In [26]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [27]:
data.dropna(inplace=True)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     50 non-null     object
 1   Time     50 non-null     object
 2   Name     50 non-null     object
 3   Message  50 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


In [29]:
# import nltk
# nltk.download('vader_lexicon')

In [30]:
sia = SentimentIntensityAnalyzer()                        # nltk.sentiment.vader.SentimentIntensityAnalyzer.polarity_scores
sentiment = sia.polarity_scores(data['Message'][10])  # Sending a particular text to check the polarity score

In [31]:
sentiment

{'neg': 0.0, 'neu': 0.915, 'pos': 0.085, 'compound': 0.0772}

In [32]:
sentiment['compound']

0.0772

In [33]:
def SentimentalAnalysis(data, column_name):

    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()   
    
    # Creating the respective columns
    # Forloop for a particular column - Message, apply forloop to all the rows in that column
    data['Scores'] = data[column_name].apply(lambda message : sia.polarity_scores(message))  
    data['Compound'] = data['Scores'].apply(lambda score_dict : score_dict['compound'])
    data['Posivite'] = data['Scores'].apply(lambda score_dict : score_dict['pos'])
    data['Negative'] = data['Scores'].apply(lambda score_dict : score_dict['neg'])
    data['Neutral'] = data['Scores'].apply(lambda score_dict : score_dict['neu'])
    
    data['Feedback'] = data['Compound'].apply(lambda comp: 'Positive Chat' if comp>0 else 'Negative Chat')

    PosNeg = pd.DataFrame(data['Feedback'].value_counts())
    return PosNeg,data

In [34]:
PosNeg,Senti_data = SentimentalAnalysis(data,column_name = 'Message')

In [35]:
Senti_data

Unnamed: 0,Date,Time,Name,Message,Scores,Compound,Posivite,Negative,Neutral,Feedback
0,05/12/19,1:42 pm,Mabel Infoziant,Hi this is Mabel we just spoke,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
1,05/12/19,1:42 pm,Mabel Infoziant,What’s your full name,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
2,05/12/19,1:42 pm,AR❤,Ramisha Rani K,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
3,05/12/19,1:42 pm,Mabel Infoziant,Ok,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.296,1.0,0.0,0.0,Positive Chat
4,05/12/19,1:42 pm,Mabel Infoziant,ramisharanik@gmail.com,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
5,05/12/19,1:43 pm,Mabel Infoziant,Your email Id?,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
6,05/12/19,1:43 pm,AR❤,Yes Mam,"{'neg': 0.0, 'neu': 0.27, 'pos': 0.73, 'compou...",0.4019,0.73,0.0,0.27,Positive Chat
7,05/12/19,1:43 pm,Mabel Infoziant,I will send 2 abstracts for u to start working,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat
8,05/12/19,1:43 pm,AR❤,Yeah mam,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp...",0.296,0.688,0.0,0.312,Positive Chat
9,05/12/19,1:43 pm,Mabel Infoziant,Give me the list that u have too,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat


In [36]:
PosNeg

Unnamed: 0_level_0,count
Feedback,Unnamed: 1_level_1
Negative Chat,28
Positive Chat,22


## Topic Modelling 
### Using TFIDF & LDA

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df = 0.95, min_df = 2, stop_words='english')
mess_vectors = tfidf.fit_transform(data['Message'])
print(mess_vectors)

  (0, 7)	0.7341176803977812
  (0, 6)	0.6790222612899971
  (2, 19)	1.0
  (3, 16)	1.0
  (6, 10)	0.432891758440903
  (6, 31)	0.9014459082351769
  (7, 24)	0.6118703395854738
  (7, 1)	0.6118703395854738
  (7, 21)	0.5012278674127307
  (8, 30)	0.9014459082351769
  (8, 10)	0.432891758440903
  (10, 25)	0.43695029836861626
  (10, 2)	0.404157245618755
  (10, 14)	0.404157245618755
  (10, 17)	0.43695029836861626
  (10, 15)	0.404157245618755
  (10, 21)	0.3579380336772536
  (11, 30)	0.7212292400644764
  (11, 10)	0.6926964582528322
  (12, 28)	0.4472135954999579
  (12, 15)	0.8944271909999159
  (13, 16)	1.0
  (14, 22)	0.353799448795111
  (14, 9)	0.353799448795111
  (14, 13)	0.3272468545286066
  :	:
  (33, 5)	0.5563622979371865
  (33, 27)	0.5563622979371865
  (33, 10)	0.267175935110481
  (36, 11)	0.7071067811865476
  (36, 23)	0.7071067811865476
  (37, 10)	0.505730692210292
  (37, 16)	0.8626914088806605
  (38, 13)	0.6790222612899971
  (38, 19)	0.7341176803977812
  (39, 12)	1.0
  (40, 10)	1.0
  (41, 16)	1.

In [42]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
lda_model = LDA(n_components = 5, random_state=42)
Topic_results = lda_model.fit_transform(mess_vectors)
Topic_results

array([[0.65994449, 0.08290249, 0.08289483, 0.0838449 , 0.09041329],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.10001807, 0.10165353, 0.10000977, 0.10002573, 0.5982929 ],
       [0.10000468, 0.59998022, 0.10000257, 0.10000666, 0.10000587],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.08833466, 0.46866872, 0.27155801, 0.08572211, 0.08571649],
       [0.0746833 , 0.07402273, 0.07340262, 0.07341444, 0.70447691],
       [0.08569067, 0.08568828, 0.6572307 , 0.08569633, 0.08569402],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.05991805, 0.76532192, 0.05829917, 0.05810249, 0.05835837],
       [0.08286264, 0.08286084, 0.66854438, 0.08286697, 0.08286517],
       [0.464407  , 0.08761158, 0.08542732, 0.27710416, 0.08544994],
       [0.10000468, 0.59998022, 0.10000257, 0.10000666, 0.10000587],
       [0.05207708, 0.79675956, 0.

In [44]:
for index,topic in enumerate(lda_model.components_):
    results = [tfidf.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
print(results)

['details', 'yes', 'ask', 'mam', 'send', 'church', 'ramisha', 'hi', 'start', 'abstracts']


In [49]:
data['Topic'] = Topic_results.argmax(axis=1)

In [50]:
data

Unnamed: 0,Date,Time,Name,Message,Scores,Compound,Posivite,Negative,Neutral,Feedback,Topic
0,05/12/19,1:42 pm,Mabel Infoziant,Hi this is Mabel we just spoke,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,0
1,05/12/19,1:42 pm,Mabel Infoziant,What’s your full name,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,0
2,05/12/19,1:42 pm,AR❤,Ramisha Rani K,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,4
3,05/12/19,1:42 pm,Mabel Infoziant,Ok,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.296,1.0,0.0,0.0,Positive Chat,1
4,05/12/19,1:42 pm,Mabel Infoziant,ramisharanik@gmail.com,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,0
5,05/12/19,1:43 pm,Mabel Infoziant,Your email Id?,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,0
6,05/12/19,1:43 pm,AR❤,Yes Mam,"{'neg': 0.0, 'neu': 0.27, 'pos': 0.73, 'compou...",0.4019,0.73,0.0,0.27,Positive Chat,1
7,05/12/19,1:43 pm,Mabel Infoziant,I will send 2 abstracts for u to start working,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,4
8,05/12/19,1:43 pm,AR❤,Yeah mam,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp...",0.296,0.688,0.0,0.312,Positive Chat,2
9,05/12/19,1:43 pm,Mabel Infoziant,Give me the list that u have too,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0,Negative Chat,0


In [51]:
data['Topic'].value_counts()

Topic
2    16
0    14
1    10
4     6
3     4
Name: count, dtype: int64