## Importing libraries

In [None]:
import re
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from itertools import combinations

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.metrics import pairwise_distances_argmin_min, silhouette_score

import warnings
warnings.filterwarnings('ignore')

### Extracting date time, contact-name, and message from the chat logs

In [None]:
def date_time(s):
    """Return True when *s* begins with a chat-export message header.

    The expected prefix looks like ``12/31/23, 9:15 PM - ``: a
    slash-separated date, a comma, a 12-hour clock time with an AM/PM
    marker, then `` - ``.

    Args:
        s: One line of the exported chat log.

    Returns:
        bool: True if the line starts a new message, False otherwise.
    """
    # Raw string so ``\s`` reaches the regex engine verbatim; in a normal
    # string literal it is an invalid escape sequence that newer Python
    # versions warn about. ``/`` needs no escaping in a Python regex.
    pattern = r'^([0-9]+/[0-9]+/[0-9]+, [0-9]+:[0-9]+\s(PM|AM|am|pm) - )'
    return re.match(pattern, s) is not None

def contact(s):
    """Return True when *s* contains an ``author: text`` separator.

    The previous check required exactly one ``:`` in the whole string, so a
    message such as ``John: meet at 10:30`` or one containing a URL was
    treated as having no author. Looking for the ``": "`` separator instead
    accepts any number of colons in the message body.

    Args:
        s: The part of a chat line that follows the date/time header.

    Returns:
        bool: True if an ``": "`` separator is present, False otherwise.
    """
    return ": " in s

def getmsg(line):
    """Split one message-header line into date, time, author and text.

    Args:
        line: A line for which ``date_time(line)`` is true, shaped like
            ``<date>, <time> - <author>: <message>``.

    Returns:
        tuple: ``(date, time, author, msg)``. ``author`` is None when
        ``contact`` rejects the text or no ``": "`` separator is found;
        ``msg`` is then everything after the header.

    Raises:
        ValueError: If the header does not split into exactly a date and a
            time on ``", "``.
    """
    # Cut only at the first " - " so dashes inside the message survive
    # (the old split/join pair replaced every later " - " with a space).
    header, _, msg = line.partition(' - ')
    date, time = header.split(', ')

    author = None
    if contact(msg):
        # Cut only at the first ": " so the body keeps its own colons.
        sender, separator, body = msg.partition(': ')
        if separator:
            author, msg = sender, body
    return date, time, author, msg


In [None]:
# Parse the exported chat into rows of [date, time, author, message].
data = []
conversation = 'chats/WhatsApp Chat with Vishal Sir.txt'
with open(conversation, encoding="utf-8") as fp:
    # The first line of the export is read and discarded.
    fp.readline()
    msgBuffer = []
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()

        if date_time(line):
            # A new header closes the message collected so far.
            if msgBuffer:
                data.append([date, time, author, " ".join(msgBuffer)])
            msgBuffer.clear()
            date, time, author, msg = getmsg(line)
            msgBuffer.append(msg)
        else:
            # Continuation line of a multi-line message.
            msgBuffer.append(line)

# Flush the final message: nothing follows it in the file, so no later
# header would ever trigger the append inside the loop.
if msgBuffer:
    data.append([date, time, author, " ".join(msgBuffer)])

In [None]:
# Inspect what is left in the buffer once the file has been read.
msgBuffer

### Converting the extracted data into DataFrame

In [None]:
# Build the message table, parse the dates, and discard rows with any
# missing value (for example rows whose Contact is None).
data = (
    pd.DataFrame(data, columns=["Date", "Time", "Contact", "Message"])
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .dropna()
)
data

## Cleaning the Data

In [None]:
# Custom English stop-word list (lower-case strings, order preserved).
stop_words = """
i me my myself we our ours ourselves you your yours yourself
yourselves he him his himself she her hers herself it its itself
they them their theirs themselves what which who whom this that these
those am is are was were be been being have has had having do
does did doing a an the and but if or because as until while
of at by for with about against between into through during before
after above below to from up down in out on off over under again
further then once here there when where why how all any both each
few more most other some such no nor not only own same so than
too very s t can will just don should now
""".split()

### Removing deleted-message notices and media-omitted placeholders


In [None]:
# Blank out the media placeholders, then drop the rows left incomplete.
is_text = data["Message"] != "<Media omitted>"
data["Message"] = data["Message"].where(is_text)
data.dropna(axis=0, inplace=True)

# Remove the "... deleted this message" notices (case-insensitive match).
string_to_match = " deleted this message"
is_deleted = data["Message"].str.contains(string_to_match, case=False)
data = data[~is_deleted]
data


### Removing punctuation and stop words, and tokenizing the messages

In [None]:
# Build the translation table once instead of once per message: it maps
# ' ' to ' ' (a no-op) and deletes every character in string.punctuation.
_punct_table = str.maketrans(' ', " ", string.punctuation)
data['Cleaned_message'] = data["Message"].apply(lambda x: x.lower().translate(_punct_table))

# Merge both stop-word sources into one set so each token needs a single
# constant-time lookup rather than a scan of the `stop_words` list.
_all_stop_words = set(STOPWORDS).union(stop_words)
data['Tokenized_words'] = data["Cleaned_message"].apply(
    lambda y: [word for word in word_tokenize(y) if word not in _all_stop_words]
)

In [None]:
# Display the DataFrame in its current state.
data

### Applying the lemmatization techniques

In [None]:
# Lemmatize every token, producing one list of lemmas per message.
lemmatizer = WordNetLemmatizer()
data["Lemmatized"] = data["Tokenized_words"].apply(
    lambda token_list: list(map(lemmatizer.lemmatize, token_list))
)


## Sentiment analysis

### Sentiments of each message

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()

# Score each message once and reuse the result for all three columns;
# previously polarity_scores() ran three times per message.
_message_scores = [sentiments.polarity_scores(message) for message in data["Message"]]
data['Positive'] = [scores['pos'] for scores in _message_scores]
data['Negative'] = [scores['neg'] for scores in _message_scores]
data['Neutral'] = [scores['neu'] for scores in _message_scores]

### Normalizing the values of the sentiment Analyzer

In [None]:
def _round_half_up(x):
    """Round *x* to a whole number, with a fractional part >= 0.5 going up."""
    lower = np.floor(x)
    if x - lower >= 0.5:
        return np.ceil(x)
    return lower


# Apply the same rounding to each of the three score columns.
for _column in ("Positive", "Negative", "Neutral"):
    data[_column] = data[_column].apply(_round_half_up)


In [None]:
# Display the DataFrame in its current state.
data

### Overall sentiment of the chat

In [None]:
# Column totals of the three per-message sentiment scores.
pos, neg, neu = (
    sum(data[column]) for column in ('Positive', 'Negative', 'Neutral')
)

def score(a,b,c):
    """Print which of three totals is strictly the largest.

    Args:
        a: Positive total.
        b: Negative total.
        c: Neutral total.

    Prints 'positive' when ``a`` exceeds both others, 'negative' when ``b``
    does, and 'neutral' otherwise (including every tie). Returns None.
    """
    if a > b and a > c:
        # Label spelling fixed (was 'postive').
        print('positive')
    elif b > a and b > c:
        print('negative')
    else:
        print("neutral")

# Print the overall label for the whole chat.
score(pos, neg, neu)

# Show the three raw totals as the cell output.
pos, neg, neu

### Added the sentiment in text form

In [None]:
def compare_values(row):
    """Return the sentiment label for one row of scores.

    Reads ``row['Positive']``, ``row['Negative']`` and ``row['Neutral']``.
    Returns 'positive' or 'negative' when that score is strictly greater
    than both others, and 'neutral' in every other case (ties included).
    """
    positive = row['Positive']
    negative = row['Negative']
    neutral = row['Neutral']

    if negative < positive and neutral < positive:
        return 'positive'
    if positive < negative and neutral < negative:
        return 'negative'
    return 'neutral'

In [None]:
# Label every row by applying compare_values across its score columns.
data["sentiment"] = data.apply(compare_values, axis = 1)
data

### Extracting the sentiment trends into a separate DataFrame

In [None]:
# Group the rows by their 'Date' value.
grouped = data.groupby('Date')

def most_common_sentiment(series):
    """Return the value that occurs most often in *series*.

    Raises IndexError when *series* is empty.
    """
    ranked = Counter(series).most_common(1)
    top_value, _occurrences = ranked[0]
    return top_value

# One row per date: first label, last label, and most frequent label.
_daily_aggregations = {
    'start_sentiment': ('sentiment', 'first'),
    'end_sentiment': ('sentiment', 'last'),
    'most_common_sentiment': ('sentiment', most_common_sentiment),
}
sentiment_logs = grouped.agg(**_daily_aggregations).reset_index()

# Show the per-date summary
print(sentiment_logs)

### Extracted sentiment at start and end of the conversation

In [None]:
grouped = data.groupby('Date')

# Copy each date's first and last sentiment label onto all of its rows.
for _column, _position in (
    ('Start Conversation', 'first'),
    ('Stop Conversation', 'last'),
):
    data[_column] = grouped['sentiment'].transform(_position)
data

## Analysing the word frequencies

### Frequency of each word

In [None]:
# Flatten the per-message lemma lists into one long list of words.
combined_words = []
for lemmas in data['Lemmatized']:
    combined_words.extend(lemmas)

# Frequency distribution over all words in the chat.
fdist = nltk.FreqDist(combined_words)

### Most common words

In [None]:
print("The most common 10 words are: \n")

# Emit each of the ten most frequent words followed by ", ".
top_ten = [word for word, _count in fdist.most_common(10)]
print("".join(f"{word}, " for word in top_ten), end="")


## Topic modeling

In [None]:
# Join each token list back into one space-separated string per message.
data["Tokenized_mgs"] = data["Tokenized_words"].apply(" ".join)
data

### Latent Dirichlet Allocation (LDA)

In [None]:
# Count-based document-term matrix built from the space-joined tokens.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Tokenized_mgs'])


# Fit a 5-component LDA model on the counts (seed fixed via random_state=0).
lda_model = LatentDirichletAllocation(n_components=5, random_state=0)
lda_model.fit(X)


def print_topics(model, vectorizer, top_n=10):
    """Print the *top_n* highest-weighted terms of every topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per
            topic).
        vectorizer: Object whose ``get_feature_names_out()`` maps column
            indices to terms.
        top_n: Number of terms to list per topic.
    """
    # Fetch the vocabulary once; it used to be rebuilt for every word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        # argsort is ascending, so the reversed tail holds the top weights.
        print([feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]])

# Show the top terms of each LDA topic.
print_topics(lda_model, vectorizer)


# Per-document topic weights from the fitted LDA model.
topic_distribution = lda_model.transform(X)


def most_relevant(topic_distribution):
    """Return, for each row, the index of its largest entry.

    Args:
        topic_distribution: Iterable of rows that each provide ``argmax()``.

    Returns:
        list: One topic index per row, in input order.
    """
    return [distribution.argmax() for distribution in topic_distribution]

# Index of the highest-weighted LDA topic for every message.
relevant_topics = most_relevant(topic_distribution)


# Store that topic index alongside each message.
data['LDA'] = relevant_topics

### Latent Semantic Analysis (LSA)

In [None]:
# TF-IDF document-term matrix; note this rebinds X from the previous cell.
tfvectorizer = TfidfVectorizer()
X = tfvectorizer.fit_transform(data["Tokenized_mgs"])


# Fit a 5-component truncated SVD on the TF-IDF matrix.
lsa_model = TruncatedSVD(n_components=5, random_state=0)
lsa_model.fit(X)


def print_topics(model, vectorizer, top_n=10):
    """Print one line per topic listing its *top_n* heaviest terms.

    Args:
        model: Fitted model exposing ``components_``.
        vectorizer: Object providing ``get_feature_names_out()``.
        top_n: Number of terms shown per topic.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for topic_no, weights in enumerate(model.components_):
        # Reversed tail of the ascending argsort = largest weights first.
        strongest = weights.argsort()[:-top_n - 1:-1]
        joined = ' '.join(vocabulary[j] for j in strongest)
        print(f"Topic {topic_no}: {joined}")

# Show the top terms of each LSA component.
print_topics(lsa_model, tfvectorizer)


# Per-document component values from the fitted SVD.
topic_distribution = lsa_model.transform(X)


def most_relevant(topic_distribution):
    """Pick the position of the highest value in every row.

    Args:
        topic_distribution: Iterable of rows that each provide ``argmax()``.

    Returns:
        list: The winning index for each row, in input order.
    """
    winners = []
    for row in topic_distribution:
        winners += [row.argmax()]
    return winners


# Index of the largest LSA component value for every message.
relevant_topics = most_relevant(topic_distribution)


# Store that component index alongside each message.
data['LSA'] = relevant_topics

### Non-negative Matrix Factorization (NMF)

In [None]:
# Fresh count-based matrix; rebinds both `vectorizer` and `X`.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data["Tokenized_mgs"])


# Fit a 5-component NMF model; W is the transformed data returned by
# fit_transform and H is the model's components_ matrix.
nmf_model = NMF(n_components=5, random_state=0)
W = nmf_model.fit_transform(X)
H = nmf_model.components_


def print_topics(model, vectorizer, top_n=10):
    """Print the *top_n* highest-weighted terms of every topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per
            topic).
        vectorizer: Object whose ``get_feature_names_out()`` maps column
            indices to terms.
        top_n: Number of terms to list per topic.
    """
    # Look the vocabulary up once rather than once per printed word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        # Reversed tail of the ascending argsort = largest weights first.
        print([feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]])

# Show the top terms of each NMF topic.
print_topics(nmf_model, vectorizer)


def most_relevant(W):
    """Return the index of the largest entry in each row of *W*.

    Args:
        W: Iterable of rows that each provide ``argmax()``.

    Returns:
        list: One index per row, in input order.
    """
    return [weights.argmax() for weights in W]


# Index of the highest-weighted NMF topic for every message.
relevant_topics = most_relevant(W)


# Store that topic index alongside each message.
data['NMF'] = relevant_topics

### Probabilistic Latent Semantic Analysis (pLSA)

In [None]:
# Fresh count-based matrix; rebinds both `vectorizer` and `X`.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data["Tokenized_mgs"])


# NOTE(review): the model named "pLSA" is a LatentDirichletAllocation with
# both priors set to 0.1 -- confirm this is the intended stand-in for pLSA.
plsa_model = LatentDirichletAllocation(n_components=5, doc_topic_prior=0.1, topic_word_prior=0.1, random_state=0)
plsa_model.fit(X)


def print_topics(model, vectorizer, top_n=10):
    """Print the *top_n* highest-weighted terms of every topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per
            topic).
        vectorizer: Object whose ``get_feature_names_out()`` maps column
            indices to terms.
        top_n: Number of terms to list per topic.
    """
    # Hoisted out of the loop: the vocabulary is the same for every topic.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        # Reversed tail of the ascending argsort = largest weights first.
        print([feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]])

# Show the top terms of each topic of the model labelled pLSA.
print_topics(plsa_model, vectorizer)


def most_relevant(topic_distribution):
    """Map every row to the index of its highest value.

    Args:
        topic_distribution: Iterable of rows that each provide ``argmax()``.

    Returns:
        list: The index of the maximum of each row, in input order.
    """
    return list(map(lambda row: row.argmax(), topic_distribution))


# Per-document topic weights from the model labelled pLSA.
topic_distribution = plsa_model.transform(X)


# Index of the highest-weighted topic for every message.
relevant_topics = most_relevant(topic_distribution)


# Store that topic index alongside each message.
data['pLSA'] = relevant_topics

In [None]:
# Display the DataFrame, now including the four topic-index columns.
data

In [None]:
# Print the DataFrame summary produced by info().
data.info()

### Comparing all the topic models

### Coherence scores 

In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Print each topic as ``Topic <n>: term term ...``.

    Args:
        model: Fitted model exposing ``components_``.
        vectorizer: Object providing ``get_feature_names_out()``.
        top_n: How many of the heaviest terms to show per topic.
    """
    names = vectorizer.get_feature_names_out()
    for number, row in enumerate(model.components_):
        # Ascending argsort read backwards yields the heaviest terms first.
        best_columns = row.argsort()[:-top_n - 1:-1]
        listing = ' '.join([names[col] for col in best_columns])
        print(f"Topic {number}: {listing}")


def coherence_score(model, vectorizer, documents, top_n=10):
    """Average document co-occurrence count of each topic's top-word pairs.

    For every topic, the *top_n* heaviest words are paired up and each pair
    is credited with the number of documents containing both words. Counts
    for a pair that shows up in several topics are added together.

    Args:
        model: Fitted model exposing ``components_``.
        vectorizer: Vectorizer providing ``get_feature_names_out()`` and
            ``build_analyzer()``.
        documents: Iterable of raw document strings.
        top_n: Number of top words taken from each topic.

    Returns:
        float: Total co-occurrence count divided by the number of distinct
        word pairs, or 0.0 when there are no pairs (no topics, or
        ``top_n`` below 2).
    """
    words = vectorizer.get_feature_names_out()

    # Tokenise each document once, using the vectorizer's own analyzer so
    # tokens match the vocabulary. Testing `word in doc` on the raw string
    # was a substring match ("he" was found inside "the").
    analyze = vectorizer.build_analyzer()
    doc_tokens = [set(analyze(doc)) for doc in documents]

    co_occurrences = defaultdict(int)
    for topic in model.components_:
        top_words = [words[i] for i in topic.argsort()[:-top_n - 1:-1]]
        for first, second in combinations(top_words, 2):
            co_occurrences[(first, second)] += sum(
                1 for tokens in doc_tokens if first in tokens and second in tokens
            )

    # Avoid ZeroDivisionError when no word pair could be formed.
    if not co_occurrences:
        return 0.0
    return sum(co_occurrences.values()) / len(co_occurrences)

# Compute coherence scores. Each model is paired with a vectorizer of the
# kind it was fitted with: counts for LDA/NMF/pLSA, TF-IDF for LSA.
# NOTE(review): assumes the cells ran top to bottom, so `vectorizer` is
# still the CountVectorizer created for the pLSA model -- confirm.
lda_coherence = coherence_score(lda_model, vectorizer, data["Tokenized_mgs"])
nmf_coherence = coherence_score(nmf_model, vectorizer, data["Tokenized_mgs"])
plsa_coherence = coherence_score(plsa_model, vectorizer, data["Tokenized_mgs"])
lsa_coherence = coherence_score(lsa_model, tfvectorizer, data["Tokenized_mgs"])

# print_topics() prints its output and returns None, so it is called as a
# statement; interpolating it into an f-string printed "... Coherence: None"
# under a "Coherence Scores" heading.
for _label, _model, _vectorizer in (
    ("LDA", lda_model, vectorizer),
    ("NMF", nmf_model, vectorizer),
    ("PLSA", plsa_model, vectorizer),
    ("LSA", lsa_model, tfvectorizer),
):
    print(f"\n{_label} Topics:")
    print_topics(_model, _vectorizer)

print("\nCoherence Scores:")
print(f"LDA Coherence: {lda_coherence}")
print(f"NMF Coherence: {nmf_coherence}")
print(f"PLSA Coherence: {plsa_coherence}")
print(f"LSA Coherence: {lsa_coherence}")

### Topic diversity

In [None]:
def topic_diversity(topics, top_n=10, feature_names=None):
    """Return the share of distinct words among all topics' top words.

    Args:
        topics: Iterable of topic weight rows (e.g. ``model.components_``),
            each providing ``argsort()``.
        top_n: Number of top words taken from each topic.
        feature_names: Optional index-to-term sequence. When omitted, the
            module-level ``vectorizer`` is consulted, as before.

    Returns:
        float: ``unique top words / total top words``; 0.0 when no top
        words were collected (no topics, or ``top_n`` of 0).
    """
    if feature_names is None:
        # Backward-compatible default: fall back to the global vectorizer,
        # fetching its vocabulary once instead of once per word.
        feature_names = vectorizer.get_feature_names_out()

    unique_words = set()
    total_words = 0
    for topic in topics:
        top_words = [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        unique_words.update(top_words)
        total_words += len(top_words)

    # Guard against dividing by zero on empty input.
    if total_words == 0:
        return 0.0
    return len(unique_words) / total_words

# Diversity of the top words for each of the four fitted models.
lda_diversity, nmf_diversity, plsa_diversity, lsa_diversity = (
    topic_diversity(fitted.components_)
    for fitted in (lda_model, nmf_model, plsa_model, lsa_model)
)

for _label, _value in (
    ("LDA", lda_diversity),
    ("NMF", nmf_diversity),
    ("pLSA", plsa_diversity),
    ("LSA", lsa_diversity),
):
    print(f"{_label} Diversity: {_value}")



## Visualizations

### Generating word clouds from the frequencies of each word

In [None]:
# Build a word cloud from the word frequency distribution `fdist`.
wordcloud = WordCloud().generate_from_frequencies(fdist)

In [None]:
# Render the word cloud as an image without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # Hide axis
plt.show()


### Topic visualizations

In [None]:
# Passed to the plotting helpers below.
num_topics = 5

# TF-IDF vectorizer; note this rebinds the global `vectorizer` name, which
# earlier cells bound to a CountVectorizer.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data['Tokenized_mgs'])

# Create a Document-Term Matrix using CountVectorizer for LDA
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(data["Tokenized_mgs"])

# Create Word Clouds
def create_word_cloud(model, vectorizer, num_topics):
    """Show one word cloud per topic, built from its ten heaviest terms.

    Args:
        model: Fitted model exposing ``components_``.
        vectorizer: Object providing ``get_feature_names_out()``.
        num_topics: Accepted for interface compatibility; not read here.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for topic_no, weights in enumerate(model.components_):
        # Reversed tail of the ascending argsort = ten largest weights.
        strongest = weights.argsort()[:-11:-1]
        frequencies = {vocabulary[j]: weights[j] for j in strongest}
        image = WordCloud(
            width=400, height=200, background_color='white'
        ).generate_from_frequencies(frequencies)

        plt.figure(figsize=(10, 5))
        plt.imshow(image, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Topic {topic_no}")
        plt.show()

# Draw the per-topic word clouds for each model.
# NOTE(review): nmf_model was fitted on a CountVectorizer matrix, while
# `vectorizer` in this cell is a TfidfVectorizer; the term labels only line
# up if both produce the same vocabulary -- confirm.
print("\nWord Clouds for LDA:")
create_word_cloud(lda_model, count_vectorizer, num_topics)

print("\nWord Clouds for NMF:")
create_word_cloud(nmf_model, vectorizer, num_topics)

print("\nWord Clouds for pLSA:")
create_word_cloud(plsa_model, count_vectorizer, num_topics)

print("\nWord Clouds for LSA:")
create_word_cloud(lsa_model, vectorizer, num_topics)

# Create Bar Charts
def create_bar_chart(model, vectorizer, num_topics, top_n=10):
    """Show one horizontal bar chart per topic with its heaviest terms.

    Args:
        model: Fitted model exposing ``components_``.
        vectorizer: Object providing ``get_feature_names_out()``.
        num_topics: Accepted for interface compatibility; not read here.
        top_n: Number of terms plotted per topic.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for topic_no, weights in enumerate(model.components_):
        # Reversed tail of the ascending argsort = largest weights first.
        ranked = weights.argsort()[:-top_n - 1:-1]
        rows = [(vocabulary[j], weights[j]) for j in ranked]
        table = pd.DataFrame(rows, columns=['Term', 'Weight'])

        plt.figure(figsize=(10, 5))
        sns.barplot(x='Weight', y='Term', data=table)
        plt.title(f"Top Terms for Topic {topic_no}")
        plt.show()

# Draw the per-topic bar charts for each model.
# NOTE(review): nmf_model was fitted on a CountVectorizer matrix, while
# `vectorizer` in this cell is a TfidfVectorizer; the term labels only line
# up if both produce the same vocabulary -- confirm.
print("\nBar Charts for LDA:")
create_bar_chart(lda_model, count_vectorizer, num_topics)

print("\nBar Charts for NMF:")
create_bar_chart(nmf_model, vectorizer, num_topics)

print("\nBar Charts for pLSA:")
create_bar_chart(plsa_model, count_vectorizer, num_topics)

print("\nBar Charts for LSA:")
create_bar_chart(lsa_model, vectorizer, num_topics)


### Visualizing sentiments over time 

In [None]:
sid = SentimentIntensityAnalyzer()

# Analyze sentiment
def get_sentiment(text):
    """Return the 'compound' entry of the analyzer's scores for *text*."""
    return sid.polarity_scores(text)['compound']

# Compound score of every message, computed on the space-joined tokens.
data['sentiment_compound'] = data['Tokenized_mgs'].apply(get_sentiment)

# Aggregate sentiment scores by date
data['Date'] = pd.to_datetime(data['Date']).dt.date
daily_sentiment = (
    data.groupby('Date')['sentiment_compound']
    .mean()
    .reset_index()
)

# Plot sentiment trends
plt.figure(figsize=(10, 5))
sns.lineplot(x='Date', y='sentiment_compound', data=daily_sentiment, marker='o')
plt.title('Sentiment Trend')
plt.ylabel('Average Sentiment')
plt.xlabel('Date')
plt.grid(True)
plt.show()

In [None]:
def get_sentiment(text):
    """Return the full polarity-score mapping the analyzer gives for *text*."""
    return sid.polarity_scores(text)

# NOTE: rebinds `sentiments`, which an earlier cell bound to the analyzer
# instance, to a Series of score dictionaries.
sentiments = data['Tokenized_mgs'].apply(get_sentiment)

# Normalise the dates BEFORE building `df`: pd.concat copies `data`, so
# converting afterwards (as before) never reached the frame being grouped.
data["Date"] = pd.to_datetime(data['Date']).dt.date

# Expand the score dictionaries into columns next to the message data.
df = pd.concat([data, sentiments.apply(pd.Series)], axis=1)

# Aggregate sentiment scores by date
daily_sentiment = df.groupby('Date')[['pos', 'neu', 'neg', 'compound']].mean().reset_index()

# Plot sentiment trends, one line per score
plt.figure(figsize=(14, 7))
for _column, _label, _color in (
    ('pos', 'Positive', 'green'),
    ('neu', 'Neutral', 'blue'),
    ('neg', 'Negative', 'red'),
    ('compound', 'Compound', 'purple'),
):
    plt.plot(daily_sentiment['Date'], daily_sentiment[_column], marker='o', label=_label, color=_color)
plt.title('Sentiment Trend Over Time')
plt.xlabel('Date')
plt.ylabel('Sentiment Score')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Number of rows (messages) recorded for each date.
per_day = data.groupby('Date').size()
message_counts = per_day.reset_index(name='Message Count')

# Plot message volumes over time
plt.figure(figsize=(14, 7))
sns.lineplot(x='Date', y='Message Count', data=message_counts, marker='o')
plt.title('Message Volumes Over Time')
plt.ylabel('Number of Messages')
plt.xlabel('Date')
plt.grid(True)
plt.show()