In [1]:
# !pip install nltk
# !pip install pandas
# !pip install textblob

import pandas as pd
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Load Excel data into DataFrame
def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame."""
    return pd.read_excel(file_path)

# Assign Sentiment Score
def assign_sentiment_score(df):
    """Add a 'Sentiment' column with the TextBlob polarity of each message.

    The column is written onto ``df`` itself and that same DataFrame is
    returned. Entries of the 'Message' column that are not strings receive
    a neutral score of 0.
    """
    def polarity_or_neutral(text):
        # Only real strings are analysed; anything else counts as neutral.
        if not isinstance(text, str):
            return 0
        return TextBlob(text).sentiment.polarity

    scores = df['Message'].apply(polarity_or_neutral)
    df['Sentiment'] = scores
    return df


# Extract Positive, Negative, and Neutral Words
def extract_sentiment_words(df):
    """Sort every token of the 'Message' column into sentiment buckets.

    Each string message is tokenized with ``word_tokenize`` and every token
    is scored on its own with TextBlob. Tokens with polarity above 0 are
    positive, below 0 negative, and exactly 0 neutral. Non-string messages
    are skipped.

    Returns:
        A ``(positive_words, negative_words, neutral_words)`` tuple of
        lists. Tokens appear in message order and repeated tokens are kept.
    """
    positive_words = []
    negative_words = []
    neutral_words = []

    # Score each distinct token only once: the same words recur constantly
    # in chat text, and building a TextBlob per occurrence is wasted work.
    polarity_by_word = {}

    for message in df['Message']:
        if not isinstance(message, str):
            continue
        for word in word_tokenize(message):
            polarity = polarity_by_word.get(word)
            if polarity is None:
                polarity = TextBlob(word).sentiment.polarity
                polarity_by_word[word] = polarity

            if polarity > 0:
                positive_words.append(word)
            elif polarity < 0:
                negative_words.append(word)
            else:
                neutral_words.append(word)

    return positive_words, negative_words, neutral_words


# Extract Common and Most Used Words
def extract_common_words(df, top_n=10):
    """Return the most frequent tokens across all string messages.

    Args:
        df: DataFrame with a 'Message' column. Non-string entries are
            ignored.
        top_n: How many of the most frequent tokens to return. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        A list of ``(token, count)`` pairs as produced by
        ``FreqDist.most_common``.
    """
    all_words = []

    for message in df['Message']:
        if isinstance(message, str):  # skip non-string cells
            all_words.extend(word_tokenize(message))

    return FreqDist(all_words).most_common(top_n)


# Most Active User
def most_active_user(df):
    """Return the 'Username' value that occurs most often in ``df``."""
    message_counts = df['Username'].value_counts()
    return message_counts.idxmax()


# Per Day Messages by Each Member
def per_day_messages(df):
    """Count rows per ('Date', 'Username') pair.

    Returns a DataFrame with the columns 'Date', 'Username' and
    'MessageCount', one row per pair present in ``df``.
    """
    by_day_and_user = df.groupby(['Date', 'Username'])
    counts = by_day_and_user.size()
    return counts.reset_index(name='MessageCount')


# Highly Active Time of the Day
def hourly_activity(df):
    """Count messages per hour label derived from the 'Time' column.

    The label is the number before the first ':' of the time text,
    zero-padded to two digits, followed by a space and the last two
    characters of the text (presumably 'AM'/'PM' for values such as
    '9:15 AM' -- confirm against the real sheet).

    The caller's DataFrame is left untouched; rows whose 'Time' is missing
    are ignored.

    Returns:
        A DataFrame with the columns 'Hour' and 'Total Messages', one row
        per distinct label, ordered by label text.
    """
    # Work on a derived Series instead of overwriting df['Time'], and drop
    # missing values *before* stringifying so the result does not depend on
    # how a missing value happens to be rendered as text.
    times = df['Time'].dropna().astype(str)
    # Kept from the original filter: literal 'nan' text is treated as missing.
    times = times[times != 'nan']

    def hour_label(text):
        hour = int(text.split(':')[0])
        return f"{hour:02d} {text[-2:]}"

    labels = times.map(hour_label)

    # groupby sorts the labels and size() counts the rows behind each one.
    return (
        pd.DataFrame({'Hour': labels})
        .groupby('Hour')
        .size()
        .reset_index(name='Total Messages')
    )


# Main function
def _print_section(title, *body):
    """Print a ruled banner containing ``title``, then each item of ``body``."""
    rule = " ===================================================================="
    print("\n" + rule)
    print("\t \t \t " + title)
    print(rule)
    for item in body:
        print(item)


def _percentage(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return (count / total) * 100 if total else 0.0


def main(file_path):
    """Run every analysis on the chat export at ``file_path`` and print a report.

    The report covers the share of positive/negative/neutral messages, the
    words in each sentiment bucket, the most common words, the most active
    user, per-day message counts and hourly activity.
    """
    df = load_data(file_path)

    # assign_sentiment_score adds the 'Sentiment' column to df and returns it.
    scored = assign_sentiment_score(df)
    total_messages = len(scored)
    sentiment = scored['Sentiment']

    # An empty sheet would otherwise divide by zero; report 0.00% instead.
    percentage_positive = _percentage(int((sentiment > 0).sum()), total_messages)
    percentage_negative = _percentage(int((sentiment < 0).sum()), total_messages)
    percentage_neutral = _percentage(int((sentiment == 0).sum()), total_messages)

    # Compute everything first so a failure in any step prints nothing partial.
    positive_words, negative_words, neutral_words = extract_sentiment_words(df)
    common_words = extract_common_words(df)
    active_user = most_active_user(df)
    per_day_msgs = per_day_messages(df)
    hourly_act = hourly_activity(df)

    _print_section(
        "Sentiment Score",
        f"\t \t \t Positive: {percentage_positive:.2f}%",
        f"\t \t \t Negative: {percentage_negative:.2f}%",
        f"\t \t \t Neutral: {percentage_neutral:.2f}%",
    )
    _print_section("Positive Words", positive_words)
    _print_section("Negative Words", negative_words)
    _print_section("Neutral Words", neutral_words)
    _print_section("Common Words", common_words)
    _print_section("Most Active User", f"\t \t \t {active_user}")
    _print_section("Per Day Messages", per_day_msgs)
    _print_section("Hourly Activity", hourly_act)
    # Closing rule under the last section.
    print("\n ====================================================================")
    
# Script entry point: analyse the hard-coded chat export when run directly.
if __name__ == "__main__":
    import nltk
    # One-time setup: uncomment the next line to download the tokenizer
    # models (presumably needed by word_tokenize on a fresh install).
#     nltk.download('punkt')  # Download the tokenizer models
    
    # Path is relative to the current working directory.
    file_path = "./Downloads/Whatsup_groupchat_statistics/Chat1.xlsx"
    main(file_path)



	 	 	 Sentiment Score
	 	 	 Positive: 23.83%
	 	 	 Negative: 5.96%
	 	 	 Neutral: 70.21%

	 	 	 Positive Words
['interesting', 'wins', 'real', 'real', 'better', 'free', 'becoming', 'Good', 'fine', 'more', 'fine', 'good', 'first', 'first', 'first', 'true', 'most', 'famous', 'win', 'new', 'Welcome', 'Much', 'Welcome', 'Welcome', 'Thanks', 'welcome', 'new', 'precious', 'right', 'good', 'fine', 'best', 'very', 'challenging', 'many', 'calm', 'very', 'very', 'grand', 'very', 'Fine', 'whole', 'busy', 'Okay', 'okay', 'Really', 'fine', 'Really', 'very', 'hot', 'Fine', 'fine', 'much', 'Thanks', 'Welcome', 'Okay', 'first', 'Good', 'Good', 'Good', 'Fine', 'Fine', 'Better', 'confirmed', 'right', 'busy', 'Okay', 'fine', 'fine', 'Outstanding', 'Fine', 'Fine']

	 	 	 Negative Words
['game', 'other', 'mean', 'hard', 'impossible', 'other', 'down', 'other', 'horrible', 'killed', 'horrible', 'badly', 'sad', 'bad', 'tired', 'destroy', 'broken', 'few', 'long']

	 	 	 Neutral Words
['\u200eMessages', 'and',