## Import Libraries and Load EmoLex
Import all necessary libraries and load the NRC Emotion Lexicon for emotion analysis.

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode
import re
from collections import Counter
import itertools

# Fetch the NLTK resources needed by the lemmatizer and the stop-word list.
for nltk_resource in ('wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Read the tab-separated lexicon as (word, emotion, association) rows and keep
# only the rows whose association flag equals 1.
emolex_path = 'NRC_emotion_lexicon_list.txt'
emolex = pd.read_csv(
    emolex_path,
    sep='\t',
    names=['word', 'emotion', 'association'],
).loc[lambda frame: frame['association'] == 1]

# One row per word, one column per emotion; word/emotion pairs that are absent
# after the filter above become 0.
emolex_pivot = (
    emolex
    .pivot(index='word', columns='emotion', values='association')
    .fillna(0)
)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sedra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sedra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Define General Functions
This section contains reusable functions for text preprocessing, tokenization, removing stop words, lemmatization, and emotion analysis.

### Included Functions:
1. `clean_text`: Cleans text by transliterating accented characters to ASCII, removing punctuation, and converting to lowercase.
2. `basic_tokenize_text`: Splits text into tokens (words).
3. `remove_stop_words`: Removes common English stop words.
4. `lemmatize_tokens`: Reduces tokens to their base form using lemmatization.
5. `calculate_emotions`: Maps lemmatized tokens to emotions using the EmoLex lexicon.
6. `generate_ngrams`: Generates n-grams (e.g., bigrams, trigrams) from tokens.
7. `unify_columns`: Unifies column names, optionally title-cases character names, and adds a series identifier.

In [15]:

def clean_text(text):
    """Normalize a piece of text before tokenization.

    The text is passed through ``unidecode``, every character that is neither
    a word character nor whitespace is removed, and the result is lower-cased.
    """
    ascii_text = unidecode(text)
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_text)
    return without_punctuation.lower()


def basic_tokenize_text(text):
    """Coerce ``text`` to ``str``, lower-case it and split it on whitespace."""
    lowered = str(text).lower()
    tokens = lowered.split()
    return tokens


# NLTK's English stop-word list, stored as a set for membership checks.
stop_words = set(stopwords.words('english'))


def remove_stop_words(tokens):
    """Return, in order, the tokens that are not in ``stop_words``."""
    kept_tokens = []
    for token in tokens:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens


# Single shared lemmatizer instance reused for every call.
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Lemmatize each token with the shared ``WordNetLemmatizer`` instance."""
    return list(map(lemmatizer.lemmatize, tokens))


def calculate_emotions(lemmas, lexicon):
    """Sum the lexicon rows of ``lemmas`` into one score per emotion column.

    ``lexicon`` is re-indexed by the lemmas, so a repeated lemma contributes
    once per repetition and lemmas missing from the lexicon yield NaN rows,
    which the column-wise sum skips.
    """
    matched_rows = lexicon.reindex(lemmas)
    return matched_rows.sum()


def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a list of tuples.

    The result is empty when ``tokens`` has fewer than ``n`` items or when
    ``n`` is not positive.
    """
    # Zipping the sequence with itself shifted by 0..n-1 positions lines up
    # each token with its n-1 successors; zip stops at the shortest slice.
    shifted_views = (tokens[offset:] for offset in range(n))
    return [ngram for ngram in zip(*shifted_views)]


def unify_columns(data, series_name, character_column, line_column, capitalize_characters=False):
    """Return a copy of ``data`` with the shared column layout.

    ``character_column`` is renamed to ``'character'`` and ``line_column`` to
    ``'line'``. When ``capitalize_characters`` is true the character names are
    title-cased. A ``'series'`` column holding ``series_name`` is added. The
    input frame itself is left untouched.
    """
    column_renames = {character_column: 'character', line_column: 'line'}
    unified = data.rename(columns=column_renames)

    if capitalize_characters:
        unified = unified.assign(character=unified['character'].str.title())

    return unified.assign(series=series_name)

## Define Analysis Functions
Functions for specific phrases and n-gram analysis.
1. `analyze_specific_phrases`: Counts the occurrences of specific phrases for each character.
2. `analyze_ngrams`: Finds the most frequent unigrams, bigrams, and trigrams and counts their use for each character.

In [16]:
def analyze_specific_phrases(data, series_name, character_column, phrases_to_search):
    """Count, per character, the lines containing each phrase and save the table.

    Every phrase is matched as a literal, case-insensitive substring of the
    ``line`` column; missing lines never match. One row per phrase is written
    to ``FINAL_{series_name}_specific_phrases.csv`` with the columns
    ``phrase``, ``series``, ``total`` and one count column per distinct value
    of ``character_column`` (0 when that character has no matching line).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``line`` column and the ``character_column`` column.
    series_name : str
        Stored in the ``series`` column and used in the output file name.
    character_column : str
        Name of the column that identifies the speaker.
    phrases_to_search : iterable of str
        Phrases to look for.

    Returns
    -------
    None
        The result is only written to disk.
    """
    all_characters = data[character_column].unique()
    results = []

    for phrase in phrases_to_search:
        # regex=False: phrases are plain text, so characters such as "?" or
        # "." must match literally rather than act as regex operators.
        matches = data['line'].str.contains(phrase, case=False, na=False, regex=False)
        counts = data.loc[matches, character_column].value_counts()

        # Report every character, including those with no matching line.
        counts_full = {char: counts.get(char, 0) for char in all_characters}

        results.append({
            'phrase': phrase,
            'series': series_name,
            'total': counts.sum(),
            **counts_full
        })

    results_df = pd.DataFrame(results)
    results_df.to_csv(f'FINAL_{series_name}_specific_phrases.csv', index=False)

def analyze_ngrams(data, series_name, character_column):
    """Find the most frequent 1-, 2- and 3-grams and count their use per character.

    For each ``n`` in 1..3 the n-grams of every row's ``lemmas`` are built with
    ``generate_ngrams``. N-grams containing the token ``'im'`` are discarded,
    and the 10 most common remaining n-grams are reported. Each reported row
    holds the n-gram text, its type, the series name, its total number of
    occurrences (``count``) and, per distinct value of ``character_column``,
    the number of rows whose lemmas contain that n-gram. The table is written
    to ``FINAL_{series_name}_ngrams.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``lemmas`` column of token sequences and the
        ``character_column`` column. The frame is not modified.
    series_name : str
        Stored in the ``series`` column and used in the output file name.
    character_column : str
        Name of the column that identifies the speaker.

    Returns
    -------
    None
        The result is only written to disk.
    """
    all_characters = data[character_column].unique()
    ngram_results = []

    for n in [1, 2, 3]:
        # Keep the per-row n-grams in a local Series instead of adding a
        # temporary column to the caller's DataFrame.
        row_ngrams = data['lemmas'].apply(lambda lemmas: generate_ngrams(lemmas, n))

        # Tuple membership test: drops any n-gram in which one token is 'im'.
        ngram_counts = Counter(
            ngram
            for ngram in itertools.chain.from_iterable(row_ngrams)
            if 'im' not in ngram
        )

        for ngram, count in ngram_counts.most_common(10):
            # Rows are counted once each, however often the n-gram repeats
            # inside the row; `count` above is the total number of occurrences.
            contains_ngram = row_ngrams.apply(lambda grams: ngram in grams)
            character_counts = data.loc[contains_ngram, character_column].value_counts()

            character_counts_full = {char: character_counts.get(char, 0) for char in all_characters}

            ngram_results.append({
                'ngram': ' '.join(ngram),
                'ngram_type': f'{n}-gram',
                'series': series_name,
                'count': count,
                **character_counts_full
            })

    ngram_results_df = pd.DataFrame(ngram_results)
    ngram_results_df.to_csv(f'FINAL_{series_name}_ngrams.csv', index=False)


## Define Main Processing Function
This function combines all steps: text cleaning, emotion analysis, and phrase/n-gram analysis.

In [17]:
def process_dataset(data, series_name, character_column, phrases_to_search):
    """Run the full preprocessing, emotion and phrase/n-gram pipeline for one series.

    Steps, in order:

    1. Fill missing ``line`` values with ``''`` and derive ``clean_line``,
       ``tokens`` (stop words removed) and ``lemmas``.
    2. Add ``gender`` by looking up ``character_column`` in the module-level
       ``gender_mapping[series_name]``; unmapped speakers become ``'Unknown'``.
    3. Add one column per emotion of the module-level ``emolex_pivot``.
    4. Save the frame to ``FINAL_{series_name}_data_with_emotions.csv``.
    5. Run ``analyze_specific_phrases`` and ``analyze_ngrams``, which write
       their own CSV files.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``line`` column and the ``character_column`` column.
        The frame passed in is not modified.
    series_name : str
        Key into ``gender_mapping``; also used in the output file names.
    character_column : str
        Name of the column that identifies the speaker.
    phrases_to_search : iterable of str
        Forwarded to ``analyze_specific_phrases``.

    Returns
    -------
    pandas.DataFrame
        The processed frame without any ``*_grams`` helper columns.

    Raises
    ------
    KeyError
        If ``series_name`` is not a key of ``gender_mapping``.
    """
    # Work on a copy so the columns added below do not leak into the frame
    # the caller still holds.
    data = data.copy()

    data['line'] = data['line'].fillna('').astype(str)
    data['clean_line'] = data['line'].apply(clean_text)
    data['tokens'] = data['clean_line'].apply(basic_tokenize_text)
    data['tokens'] = data['tokens'].apply(remove_stop_words)
    data['lemmas'] = data['tokens'].apply(lemmatize_tokens)
    # Look up gender through the same speaker column that is handed to the
    # analysis helpers below, rather than a hard-coded column name.
    data['gender'] = data[character_column].map(gender_mapping[series_name]).fillna('Unknown')

    # calculate_emotions returns one Series per row, so apply() yields a frame
    # with one column per emotion.
    emotion_columns = emolex_pivot.columns
    data[emotion_columns] = data['lemmas'].apply(lambda lemmas: calculate_emotions(lemmas, emolex_pivot))

    data.to_csv(f'FINAL_{series_name}_data_with_emotions.csv', index=False)

    analyze_specific_phrases(data, series_name, character_column, phrases_to_search)

    analyze_ngrams(data, series_name, character_column)

    # Remove any n-gram helper columns left on the frame by the analysis step.
    ngram_columns = [col for col in data.columns if '_grams' in col]
    data = data.drop(columns=ngram_columns, errors='ignore')

    return data



## Load Datasets
In this section, we load the raw data for the analysis.
The column names of each dataset are printed so that we can verify all required columns are present before proceeding to data processing and analysis.

In [18]:
# Locations of the cleaned transcript files, one per series.
himym_path, tbbt_path, friends_path = (
    'FINAL_himym_clean.csv',
    'FINAL_tbbt_clean.csv',
    'FINAL_friends_clean.csv',
)

# Read each file into its own DataFrame.
himym_data, tbbt_data, friends_data = (
    pd.read_csv(csv_path) for csv_path in (himym_path, tbbt_path, friends_path)
)

# Show the column names so the speaker/line columns of each file can be checked.
dataset_summaries = (
    ("HIMYM dataset loaded with columns:", himym_data),
    ("TBBT dataset loaded with columns:", tbbt_data),
    ("Friends dataset loaded with columns:", friends_data),
)
for summary_label, loaded_frame in dataset_summaries:
    print(summary_label, loaded_frame.columns.tolist())

HIMYM dataset loaded with columns: ['season', 'episode_num', 'name', 'line', 'title', 'original_air_date', 'imdb_rating', 'total_votes', 'desc', 'word_count']
TBBT dataset loaded with columns: ['season', 'episode_num', 'dialogue', 'person_scene', 'title', 'original_air_date', 'imdb_rating', 'total_votes', 'desc', 'word_count']
Friends dataset loaded with columns: ['season', 'episode_num', 'character', 'line', 'title', 'original_air_date', 'imdb_rating', 'total_votes', 'desc', 'word_count']


## Process Individual Series
Apply the function to process each dataset with its specific parameters.

In [19]:
# Character name -> gender label, keyed by series name. process_dataset looks
# speakers up here; names not listed are labelled 'Unknown' there.
gender_mapping = {
    'HIMYM': {
        'Ted': 'Male',
        'Marshall': 'Male',
        'Barney': 'Male',
        'Lily': 'Female',
        'Robin': 'Female',
    },
    'Friends': {
        'Ross': 'Male',
        'Monica': 'Female',
        'Chandler': 'Male',
        'Rachel': 'Female',
        'Joey': 'Male',
        'Phoebe': 'Female',
    },
    'TBBT': {
        'Sheldon': 'Male',
        'Leonard': 'Male',
        'Penny': 'Female',
        'Howard': 'Male',
        'Raj': 'Male',
        'Amy': 'Female',
        'Bernadette': 'Female',
    },
}


# Bring every dataset to the shared 'character' / 'line' / 'series' layout.
# Only the Friends speaker names are title-cased.
himym_data_unified = unify_columns(
    himym_data, series_name='HIMYM', character_column='name', line_column='line'
)
tbbt_data_unified = unify_columns(
    tbbt_data, series_name='TBBT', character_column='person_scene', line_column='dialogue'
)
friends_data_unified = unify_columns(
    friends_data,
    series_name='Friends',
    character_column='character',
    line_column='line',
    capitalize_characters=True,
)

# Phrases searched for in each series.
tbbt_phrases = ['bazinga', 'spock', 'sweetie', 'honey', 'mother', 'jewish', 'science', 'nerd', 'knock']
himym_phrases = ['suit up', 'legend', 'met Ted', 'awesome', 'challenge accepted', 'lawyered', 'pause']
friends_phrases = ['how you doin', 'be more', 'on a break', 'oh my god', 'smelly cat', 'i know']

# Run the full pipeline per series; each call also writes that series' CSV files.
tbbt_data_processed = process_dataset(tbbt_data_unified, 'TBBT', 'character', tbbt_phrases)
himym_data_processed = process_dataset(himym_data_unified, 'HIMYM', 'character', himym_phrases)
friends_data_processed = process_dataset(friends_data_unified, 'Friends', 'character', friends_phrases)


## Combine All Processed Datasets
The final step combines all processed datasets (`HIMYM`, `Friends`, and `TBBT`) into one comprehensive dataset. The `series` column differentiates between the shows.

In [20]:

# Stack the three processed frames (HIMYM, Friends, TBBT) with a fresh
# 0..n-1 index, then persist the result.
processed_frames = [himym_data_processed, friends_data_processed, tbbt_data_processed]
combined_data = pd.concat(processed_frames, ignore_index=True)
combined_data.to_csv('FINAL_combined_data.csv', index=False)

print("All datasets have been successfully combined and saved as 'FINAL_combined_data.csv'.")

All datasets have been successfully combined and saved as 'FINAL_combined_data.csv'.
