In [1]:
import re
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def load_chat_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return lines

def parse_messages(lines):
    data = []
    for line in lines:
        match = re.match(r'^(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2} ?[APap][Mm]) - ([^:]+): (.*)', line)
        if match:
            date, time, user, message = match.groups()
            data.append([date, time, user.strip(), message.strip()])
    
    df = pd.DataFrame(data, columns=['Date', 'Time', 'User', 'Message'])
    df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y', errors='coerce')
    
    return df

In [3]:
def get_message_counts(df):
    return df.groupby('Date').size()

In [4]:
def get_most_talkative_users(df):
    return df['User'].value_counts()

In [5]:
def analyze_activity(df):
    df = df.dropna(subset=['Date'])  
    if df.empty:
        return None, None, 0  
    
    df['DayOfWeek'] = df['Date'].dt.day_name()
    most_active_date = df['Date'].value_counts().idxmax()
    most_active_day = df['DayOfWeek'].value_counts().idxmax()
    avg_messages_per_day = df.groupby('Date').size().mean()
    return most_active_date, most_active_day, avg_messages_per_day

In [6]:
def media_statistics(df):
    media_count = df['Message'].str.contains('<Media omitted>').sum()
    deleted_count = df['Message'].str.contains('This message was deleted').sum()
    missed_voice_calls = df['Message'].str.contains('Missed voice call').sum()
    missed_video_calls = df['Message'].str.contains('Missed video call').sum()
    return media_count, deleted_count, missed_voice_calls, missed_video_calls

In [7]:
def train_flirt_model(df, labeled_data):
    if labeled_data.empty:
        print("Insufficient labeled data for training.")
        return df
    
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train, X_test, y_train, y_test = train_test_split(labeled_data['Message'], labeled_data['Label'], test_size=0.2, random_state=42, stratify=labeled_data['Label'])
    
    model = make_pipeline(vectorizer, MultinomialNB())
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    print("Model Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    df['Flirt'] = model.predict(df['Message'])
    return df


In [8]:
def flirt_statistics(df):
    if 'Flirt' not in df.columns:
        print("Flirt prediction not performed.")
        return pd.Series()
    
    flirt_counts = df[df['Flirt'] == 1]['User'].value_counts()
    total_counts = df['User'].value_counts()
    flirt_percentage = (flirt_counts / total_counts * 100).fillna(0)
    return flirt_percentage

In [9]:
filename = 'office_group.txt'
chat_lines = load_chat_data(filename)
df = parse_messages(chat_lines)
print("Date-wise Message Count:\n", get_message_counts(df))
print("Most Talkative Users:\n", get_most_talkative_users(df))
most_active_date, most_active_day, avg_msgs = analyze_activity(df)
print(f"Most Active Date: {most_active_date}, Most Active Day: {most_active_day}, Avg Messages per Day: {avg_msgs}")
media_stats = media_statistics(df)
print(f"Media Stats - Media: {media_stats[0]}, Deleted: {media_stats[1]}, Missed Voice: {media_stats[2]}, Missed Video: {media_stats[3]}")


labeled_data = pd.DataFrame({
    "Message": [
        "You have a beautiful smile 😍",  # Flirt (1)
        "Good morning, team!",  # Non-Flirt (0)
        "Let's meet for coffee 😉",  # Flirt (1)
        "Please share the report.",  # Non-Flirt (0)
        "I love your eyes ❤️",  # Flirt (1)
        "The meeting is at 3 PM.",  # Non-Flirt (0)
        "You're so charming 😊",  # Flirt (1)
        "Send me the project file.",  # Non-Flirt (0)
    ],
    "Label": [1, 0, 1, 0, 1, 0, 1, 0]
})

df = train_flirt_model(df, labeled_data)
print("Flirt Statistics:\n", flirt_statistics(df))


Date-wise Message Count:
 Date
2023-09-17    72
2023-09-18    19
2023-09-19    64
2023-09-20     1
2023-09-22    22
              ..
2025-02-01    25
2025-02-03     2
2025-02-05     1
2025-02-06    45
2025-02-07     1
Length: 249, dtype: int64
Most Talkative Users:
 User
Mukundan_CEO Vcare      283
Preethi Clinic Vcare    228
Kumar IT Vcare          205
Vinoth Sankar           184
Wasim Vcare Imports     155
                       ... 
+91 93603 10822           1
+91 73393 35564           1
+91 90032 92429           1
+91 89258 03279           1
+91 80156 90056           1
Name: count, Length: 175, dtype: int64
Most Active Date: 2024-01-06 00:00:00, Most Active Day: Saturday, Avg Messages per Day: 24.17269076305221
Media Stats - Media: 538, Deleted: 99, Missed Voice: 0, Missed Video: 0
Model Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
