In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import textstat
from textblob import TextBlob
from collections import Counter
from textblob import TextBlob

In [2]:
def preprocess_text(caption):
    # Remove punctuation, numbers, and special characters, keeping only words
    caption = re.sub(r'[^a-zA-Z\s]', '', caption)
    # Convert to lowercase
    caption = caption.lower()
    # Remove extra whitespace
    caption = ' '.join(caption.split())
    return caption

def countWords(caption):
    wordList = caption.split(' ')
    return len(wordList)

def characterCount(caption):
    text = caption.replace(" ", "")
    characters = [char for char in text]
    return len(characters)

def sentenceCount(caption):
    # Using regular expression to split the caption into sentences
    sentences = re.split(r'[.!?]', caption)

    sentences = [sentence for sentence in sentences if sentence.strip()]

    return len(sentences)

def calculateAvgWordLen(caption):
    wordList = caption.split(' ')
    totalLen = 0
    for word in wordList:
        totalLen += len(word)
    
    return totalLen/len(wordList)

def calculateAvgSentLen(caption):
    sentences = re.split(r'[.!?]', caption)
    wps = 0
    for sentence in sentences:
        if sentence.strip():
            words = sentence.split()
            wps += len(words)
    
    return wps/len(sentences)

In [3]:
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def extract_pos_tags(caption):
    # Process the caption
    doc = nlp(caption)

    # Extract the POS tags
    pos_tags = [(token.text, token.pos_) for token in doc]

    return pos_tags

def extract_named_entity(caption):
    # Process the caption
    doc = nlp(caption)

    # Extract named entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    return entities

def rearrangePosTags(captions):
    tags = {}
    for caption in captions:
        for word, pos_tag in caption:
            if pos_tag in tags:
                tags[pos_tag] += 1
            else:
                tags[pos_tag] = 1
    
    return tags

def rearrangeNamedEntityTags(captions):
    tags = {}
    for caption in captions:
        for name, entity in caption:
            if entity in tags:
                tags[entity] += 1
            else:
                tags[entity] = 1
    
    return tags

In [4]:
def calculate_sentiment(captions):
    scores = []
    for caption in captions:
        blob = TextBlob(caption)
        sentiment = blob.sentiment.polarity
        # Normalize sentiment score from range [-1, 1] to [0, 1]
        normalized_sentiment = (sentiment + 1) / 2
        scores.append(normalized_sentiment)
    
    normalized_score = np.mean(scores)
    return normalized_score

def calculate_subjectivity(caption):
    blob = TextBlob(caption)
    subjectivity_scores = blob.sentiment.subjectivity

    return subjectivity_scores

def calculate_readability_scores(captions):
    readability_scores = []
    for caption in captions:
        reading_ease = textstat.flesch_reading_ease(caption)

        # Ensure scores are within the 0-100 range
        if reading_ease < 0:
            reading_ease = 0
        elif reading_ease > 100:
            reading_ease = 100

        # Normalize the readability score to 0-1 range
        normalized_reading_ease = reading_ease / 100
        readability_scores.append(normalized_reading_ease)

    reading_ease_score_mean = np.mean(readability_scores)
        
    return reading_ease_score_mean

def calculate_lexical_diversity(captions):
    scores = []
    for caption in captions:
        tokens = caption.split()
        num_tokens = len(tokens)
        num_types = len(set(tokens))
        ttr = num_types / num_tokens if num_tokens != 0 else 0
        scores.append(ttr)
    
    score = np.mean(scores)

    return score

# Function to calculate overall quality score
def calculate_overall_quality(readability, lexical_diversity, sentiment, weights=(0.4, 0.3, 0.3)):
    readability_weight, lexical_diversity_weight, sentiment_weight = weights
    overall_score = (
        readability_weight * readability +
        lexical_diversity_weight * lexical_diversity +
        sentiment_weight * sentiment
    )
    return overall_score

def get_overall_captions_quality(captions):
    # Calculate overall quality scores for each caption
    overall_quality_scores = calculate_overall_quality(calculate_readability_scores(captions), calculate_lexical_diversity(captions), calculate_sentiment(captions))

    # Calculate the final overall quality score for the influencer
    final_overall_quality_score = np.mean(overall_quality_scores)

    return final_overall_quality_score

In [5]:

import re
import os
import numpy as np

basic_features = []

categories = [
    'Automobile',
    'Beverages',
    'Clothing',
    'Electronics',
    'Entertainment',
    'Food',
    'Jewellery',
    'Makeup',
    'Non-profit',
    'Shoes',
]
quadrant = [
    'HF-HE',
    'HF-LE',
    'LF-HE',
    'LF-LE',
]
for category in categories:
    for quad in quadrant: 

        caption_folder_path = f'D://Study//Project//Major project//Bio analysis//{category}//{category}_{quad}_biography.xlsx'

        df = pd.read_excel(caption_folder_path)

        preprocessedCaptions = df['Captions'].apply(preprocess_text)

        # COUNT NUMBER OF WORDS IN CAPTIONS
        captionsWords = preprocessedCaptions.apply(countWords)

        # COUNT NUMBER OF CHARACTERS IN CAPTIONS
        captionsChars = preprocessedCaptions.apply(characterCount)

        # COUNT NUMBER OF SENTENCES IN CAPTIONS
        sentCount =  df['Captions'].apply(sentenceCount)

        # COUNT AVERAGE WORD LENGTHS
        averageWordLen = preprocessedCaptions.apply(calculateAvgWordLen)

        # COUNT AVERAGE SENTENCE LENGTH --> NUMBER OF WORD PER SENTENCE
        averageSentLen = df['Captions'].apply(calculateAvgSentLen)

        # EXTRACT THE POS TAGS
        pos_tags = df['Captions'].apply(extract_pos_tags)

        # EXTRACT NAME-ENTITY
        named_entity_tags = preprocessedCaptions.apply(extract_named_entity)

        top_50_cap_pos_tags = rearrangePosTags(pos_tags)
        top_50_cap_named_entity_tags = rearrangeNamedEntityTags(named_entity_tags)

        subjectivity_score = df['Captions'].apply(calculate_subjectivity)

        avgCapWords = captionsWords.mean()
        avgCapChar = captionsChars.mean()
        avgCapSentCount = sentCount.mean()
        avgCapAvgWordLen = averageWordLen.mean()
        avgCapAvgSentLen = averageSentLen.mean()
        avgCapSubjScore = subjectivity_score.mean()

        read_ease_score = calculate_readability_scores(list(df['Captions']))
        lexical_diversity_score = calculate_lexical_diversity(list(df['Captions']))
        sentiment_score = calculate_sentiment(list(df['Captions']))
        overall_score = get_overall_captions_quality(list(df['Captions']))

        curr = {
            'Quadrant': quad,
            'Category': category,
            'read_ease_score': read_ease_score,
            'lexical_diversity_score': lexical_diversity_score,
            'sentiment_score': sentiment_score,
            'overall_caption_score': overall_score,
            'Average Words': avgCapWords,
            'Average Characters': avgCapChar,
            'Average Sentence Count': avgCapSentCount,
            'Average Word Length': avgCapAvgWordLen,
            'Average Sentence Length': avgCapAvgSentLen,
            'Top 50 Captions Pos Tags': top_50_cap_pos_tags,
            'Top 50 Captions Named Entity Tags': top_50_cap_named_entity_tags,
            'Average Subjectivity Score': avgCapSubjScore
        }

        basic_features.append(curr)

In [6]:
caption_quality_score_df2 = pd.DataFrame(basic_features)
caption_quality_score_df2.head()

Unnamed: 0,Quadrant,Category,read_ease_score,lexical_diversity_score,sentiment_score,overall_caption_score,Average Words,Average Characters,Average Sentence Count,Average Word Length,Average Sentence Length,Top 50 Captions Pos Tags,Top 50 Captions Named Entity Tags,Average Subjectivity Score
0,HF-HE,Automobile,0.417992,0.926766,0.620637,0.631418,41.425287,251.816092,3.735632,6.13035,12.017226,"{'VERB': 400, 'ADP': 394, 'DET': 299, 'PROPN':...","{'ORDINAL': 9, 'FAC': 4, 'GPE': 16, 'PERSON': ...",0.461825
1,HF-LE,Automobile,0.438019,0.905451,0.609179,0.629597,45.982544,283.620116,4.154613,6.371646,11.382521,"{'NOUN': 15045, 'PART': 1114, 'VERB': 6327, 'A...","{'NORP': 130, 'DATE': 508, 'PERSON': 645, 'ORG...",0.42452
2,LF-HE,Automobile,0.407392,0.903653,0.63311,0.623986,43.768031,269.62963,4.364522,6.603392,9.952685,"{'PROPN': 4464, 'VERB': 2605, 'PRON': 1565, 'A...","{'GPE': 164, 'DATE': 209, 'TIME': 34, 'ORG': 1...",0.469017
3,LF-LE,Automobile,0.317371,0.912266,0.659237,0.598399,53.914286,374.057143,4.457143,6.969275,12.295179,"{'VERB': 251, 'ADP': 205, 'NOUN': 551, 'PUNCT'...","{'LOC': 3, 'ORG': 20, 'DATE': 15, 'PERSON': 25...",0.462154
4,HF-HE,Beverages,0.50874,0.888757,0.626011,0.657926,43.78,250.05,4.335,6.011502,10.600008,"{'PRON': 658, 'ADJ': 602, 'NOUN': 2315, 'PUNCT...","{'CARDINAL': 25, 'ORG': 92, 'TIME': 10, 'DATE'...",0.480132


In [7]:
caption_quality_score_df2.shape

(40, 14)

In [8]:
caption_quality_score_df2.to_excel(f'D://Study//Project//Major project//Bio analysis//biography_tect_analysis.xlsx', index=False)