In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# Read the raw personality dataset from its absolute Windows path.
df = pd.read_csv(
    r'D:\Personalized_Chatbot\data\raw\personality.csv'
)

# Structural overview: size, column names, dtypes and per-column null counts.
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isna().sum())

Dataset Shape: (8939, 3)

Columns: ['Unnamed: 0', 'Persona', 'chat']

Data Types:
 Unnamed: 0     int64
Persona       object
chat          object
dtype: object

Missing Values:
 Unnamed: 0    0
Persona       0
chat          0
dtype: int64


## Personality Analysis

In [2]:
# Personality distribution analysis
def analyze_personalities(df, keyword='personality'):
    """Plot the most frequent values of each personality-related column.

    Columns are selected by a case-insensitive substring match of ``keyword``
    against the column names. At most the first four matching columns are
    drawn, one bar chart each on a 2x2 grid, showing that column's ten most
    frequent values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. It is not modified.
    keyword : str, default 'personality'
        Substring identifying the personality columns. Pass a different value
        (for example ``'persona'``) when the dataset names them differently.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When no column matches,
        a message is printed and no figure is created.
    """
    needle = keyword.lower()
    # str(col) keeps the match from failing on non-string column labels.
    personality_cols = [col for col in df.columns if needle in str(col).lower()]

    # Without this guard an empty figure is created and shown, which looks
    # like a rendering problem rather than "nothing matched".
    if not personality_cols:
        print(f"No columns containing {keyword!r} found; "
              f"available columns: {list(df.columns)}")
        return

    fig = plt.figure(figsize=(12, 6))
    for i, col in enumerate(personality_cols[:4], 1):
        ax = fig.add_subplot(2, 2, i)
        df[col].value_counts().head(10).plot(kind='bar', ax=ax)
        ax.set_title(f'Top 10 {col}')
        ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

## Conversation Statistics

In [3]:
def conversation_analysis(df, text_col=None, personality_col='personality'):
    """Compute and plot basic text statistics for a conversation column.

    Adds two columns to ``df`` in place -- ``text_length`` (characters) and
    ``word_count`` (whitespace-separated tokens) -- and shows a 2x2 figure:
    text-length histogram, word-count histogram, the most frequent tokens
    (up to 20), and, when ``personality_col`` exists, the mean word count per
    personality.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversation data. Mutated: ``text_length`` and ``word_count`` are
        added or overwritten.
    text_col : str, optional
        Column holding the conversation text. When omitted, the first column
        whose name contains 'text', 'message' or 'dialog' (case-insensitive)
        is used.
    personality_col : str, default 'personality'
        Column to group by for the fourth panel. The panel is hidden when the
        column is absent.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    IndexError
        If ``text_col`` is omitted and no column name matches.
    """
    if text_col is None:
        candidates = [
            col for col in df.columns
            if any(key in str(col).lower() for key in ('text', 'message', 'dialog'))
        ]
        if not candidates:
            # Same exception type the bare ``[0]`` lookup used to raise, but
            # with a message that says what to do about it.
            raise IndexError(
                "No text column found: no column name contains 'text', "
                f"'message' or 'dialog' (columns: {list(df.columns)}). "
                "Pass text_col explicitly."
            )
        text_col = candidates[0]

    texts = df[text_col].astype(str)

    # Length analysis
    df['text_length'] = texts.apply(len)
    df['word_count'] = texts.apply(lambda text: len(text.split()))

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Text length distribution
    axes[0, 0].hist(df['text_length'], bins=50, alpha=0.7)
    axes[0, 0].set_title('Distribution of Text Length')
    axes[0, 0].set_xlabel('Character Count')

    # Word count distribution
    axes[0, 1].hist(df['word_count'], bins=50, alpha=0.7, color='orange')
    axes[0, 1].set_title('Distribution of Word Count')
    axes[0, 1].set_xlabel('Word Count')

    # Top words. Counting per row avoids joining the whole corpus into one
    # string; the resulting token multiset is the same.
    word_freq = Counter(word for text in texts for word in text.split())
    top_words = pd.DataFrame(word_freq.most_common(20), columns=['word', 'count'])
    # Size the bars from the data: a corpus with fewer than 20 distinct tokens
    # would otherwise fail with a shape mismatch.
    positions = range(len(top_words))
    axes[1, 0].barh(positions, top_words['count'])
    axes[1, 0].set_yticks(positions)
    axes[1, 0].set_yticklabels(top_words['word'])
    axes[1, 0].set_title('Top 20 Most Frequent Words')

    # Conversation patterns by personality
    if personality_col in df.columns:
        personality_word_counts = df.groupby(personality_col)['word_count'].mean()
        ticks = range(len(personality_word_counts))
        axes[1, 1].bar(ticks, personality_word_counts.values)
        axes[1, 1].set_title('Average Word Count by Personality')
        axes[1, 1].set_xticks(ticks)
        axes[1, 1].set_xticklabels(personality_word_counts.index, rotation=45)
    else:
        # Nothing to draw here; hide the empty frame instead of showing it.
        axes[1, 1].set_axis_off()

    fig.tight_layout()
    plt.show()

## Temporal Analysis (if timestamps exist)

In [4]:
def temporal_analysis(df):
    """Plot message counts by hour of day and by day of week.

    The first column whose name contains 'date' or 'time' (case-insensitive)
    is converted to datetime in place, and 'hour' and 'day_of_week' columns
    derived from it are added to ``df``. When no such column exists the
    function returns without doing anything.
    """
    matches = [
        name for name in df.columns
        if any(token in name.lower() for token in ('date', 'time'))
    ]
    if not matches:
        return

    stamp_col = matches[0]
    df[stamp_col] = pd.to_datetime(df[stamp_col])
    stamps = df[stamp_col]
    df['hour'] = stamps.dt.hour
    df['day_of_week'] = stamps.dt.day_name()

    _, (hour_ax, day_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: activity by hour, ordered by hour rather than by frequency.
    hourly = df['hour'].value_counts().sort_index()
    hourly.plot(kind='bar', ax=hour_ax)
    hour_ax.set_title('Message Frequency by Hour')

    # Right panel: activity by weekday, in value_counts (frequency) order.
    daily = df['day_of_week'].value_counts()
    daily.plot(kind='bar', ax=day_ax)
    day_ax.set_title('Message Frequency by Day of Week')

    plt.show()

## Personality-Text Correlation

In [5]:
def personality_correlations(df, text_col=None, personality_col='personality'):
    """Summarise punctuation-based style features per personality.

    Adds ``contains_questions``, ``exclamation_count`` and ``question_count``
    columns to ``df`` in place. When ``personality_col`` exists, prints the
    per-personality mean of those features and of ``word_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversation data. Mutated: the feature columns above are added or
        overwritten, and ``word_count`` is added if it is needed and missing.
    text_col : str, optional
        Column holding the conversation text. When omitted, the first column
        whose name contains 'text' or 'message' (case-insensitive) is used.
    personality_col : str, default 'personality'
        Column to group by. Nothing is printed when it is absent.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``text_col`` is omitted and no column name matches.
    """
    if text_col is None:
        candidates = [
            col for col in df.columns
            if any(key in str(col).lower() for key in ('text', 'message'))
        ]
        if not candidates:
            # Same exception type the bare ``[0]`` lookup used to raise, but
            # with a message that says what to do about it.
            raise IndexError(
                "No text column found: no column name contains 'text' or "
                f"'message' (columns: {list(df.columns)}). "
                "Pass text_col explicitly."
            )
        text_col = candidates[0]

    texts = df[text_col]

    # Create features for correlation. Raw strings: '\?' in a normal string
    # literal is an invalid escape sequence.
    df['contains_questions'] = texts.str.contains(r'\?')
    df['exclamation_count'] = texts.str.count('!')
    df['question_count'] = texts.str.count(r'\?')

    # Group by personality and analyze patterns
    if personality_col in df.columns:
        # word_count used to exist only if conversation_analysis had already
        # run on this frame; compute it here so the function works on its own.
        if 'word_count' not in df.columns:
            df['word_count'] = texts.astype(str).apply(lambda text: len(text.split()))

        personality_stats = df.groupby(personality_col).agg({
            'word_count': 'mean',
            'contains_questions': 'mean',
            'exclamation_count': 'mean',
            'question_count': 'mean'
        })

        print("Conversation Patterns by Personality:")
        print(personality_stats)