#### Similarity Indices

---
######  - [Reading the data set](#Reading-the-data-set)
######  - [Combining data for text similarity](#Combining-data-for-text-similarity)
######  - [Overall TFIDF Cosine Similarity](#Overall-TFIDF-Cosine-Similarity)
######  - [Pairwise TFIDF Cosine Similarity](#Pariwise-TFIDF-Cosine-Similarity)
######  - [Writing the dataset](#Writing-the-dataset)
---

#### Reading the data set
######  - [_Click here to move back to index_](#Similarity-Indices)

In [10]:
import sys
# Make the local 'src' directory importable (use 'src' if it's in the same
# folder as the notebook). The entry is relative, so it resolves against the
# kernel's current working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
if 'src' not in sys.path:
    sys.path.append('src')


In [24]:
from Data_Preprocessing import load_data

# Read the preprocessed dataset of every TV series, one CSV per series.
# The paths are listed once and loaded in order, so series_N always comes
# from processed_series_N.csv.
processed_paths = (
    'data/processed/processed_series_1.csv',
    'data/processed/processed_series_2.csv',
    'data/processed/processed_series_3.csv',
    'data/processed/processed_series_4.csv',
)

series_1, series_2, series_3, series_4 = (
    load_data(path) for path in processed_paths
)


#### Combining data for text similarity
######  - [_Click here to move back to index_](#Similarity-Indices)

In [60]:
import pandas as pd

# Number of characters kept per series, ranked by total dialogue length.
TOP_N_CHARACTERS = 6

dfs = [series_1, series_2, series_3, series_4]
# NOTE(review): the series_N -> show-name mapping relies purely on the order
# of these two lists — confirm it matches the processed CSV files.
series_names = ['Friends', 'Game of Thrones', 'The Office', 'The Big Bang Theory']
assert len(dfs) == len(series_names), "each frame needs exactly one series name"

columns = ['series', 'character', 'season', 'season_episode', 'dialogue', 'dialogue_preprocessed', 'dialogue_length']
# 'dialogue_length' is only needed to rank characters; it is not carried into
# the combined frame.
output_columns = [col for col in columns if col != 'dialogue_length']

final_dfs = []
for df, series_name in zip(dfs, series_names):
    # assign() returns a labelled copy, so the loaded series_N frames are not
    # mutated and this cell gives the same result however often it is re-run.
    labelled = df.assign(series=series_name)

    # Find top characters based on total dialogue length
    top_chars = (
        labelled.groupby('character')['dialogue_length']
        .sum()
        .nlargest(TOP_N_CHARACTERS)
        .index
    )

    # Filter to those characters and keep only the necessary columns
    df_top = labelled.loc[labelled['character'].isin(top_chars), output_columns]
    final_dfs.append(df_top)

# Combine all series into one frame with a fresh 0..n-1 index
combined_df = pd.concat(final_dfs, ignore_index=True)


In [38]:
# Preview the last five rows of the combined frame as a quick sanity check.
combined_df.tail(n=5)

Unnamed: 0,series,character,season,season_episode,dialogue,dialogue_preprocessed
136038,The Big Bang Theory,Sheldon,10,10x24,"Uh, breakfast yes, lunch no. I did have a cou...",uh breakfast yes lunch cough drop really ride ...
136039,The Big Bang Theory,Sheldon,10,10x24,How thoughtful. Thank you.,thoughtful thank
136040,The Big Bang Theory,Sheldon,10,10x24,"And I with you. Question, are you seeking a r...",question seeking romantic relationship
136041,The Big Bang Theory,Sheldon,10,10x24,"Well, that would raise a number of problems. ...",well would raise number problem colleague curr...
136042,The Big Bang Theory,Sheldon,10,10x24,"(Knock, knock, knock) Amy. (Knock, knock, kno...",knock knock knock amy knock knock knock amy kn...


#### Overall TFIDF Cosine Similarity
######  - [_Click here to move back to index_](#Similarity-Indices)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Work on a separate subset so combined_df keeps all of its columns; the
# original code overwrote combined_df here, silently discarding the columns
# previewed earlier in the notebook.
overall_source = combined_df[['series', 'character', 'dialogue']].dropna()

# One document per (series, character): every line of dialogue joined by spaces
overall_dialogues = (
    overall_source
    .groupby(['series', 'character'])['dialogue']
    .agg(' '.join)
    .reset_index()
)

# Create MultiIndex for rows and columns, in the same order as the documents
multi_index = pd.MultiIndex.from_frame(overall_dialogues[['series', 'character']])

# Vectorize: TF-IDF over the per-character documents, English stop words removed
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(overall_dialogues['dialogue'])

# Similarity matrix: cosine similarity between every pair of characters
similarity_matrix = cosine_similarity(tfidf_matrix)

# Final DataFrame with MultiIndex on both axes, so a cell is addressed as
# similarity_df.loc[(series_a, char_a), (series_b, char_b)]
similarity_df = pd.DataFrame(similarity_matrix, index=multi_index, columns=multi_index)


#### Pariwise TFIDF Cosine Similarity
######  - [_Click here to move back to index_](#Similarity-Indices)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# How many nearest neighbours to inspect per dialogue, and the minimum cosine
# similarity a pair needs to be kept.
N_NEIGHBORS = 5
MIN_SIMILARITY = 0.7

# Prepare the data: drop missing and whitespace-only dialogues, then reset the
# index so row labels line up with the row positions of the TF-IDF matrix.
dialogue_df = combined_df[['series', 'character', 'dialogue']].dropna()
dialogue_df = dialogue_df[dialogue_df['dialogue'].str.strip() != ''].reset_index(drop=True)

# Vectorize: one TF-IDF row per individual line of dialogue
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(dialogue_df['dialogue'])

# Nearest neighbours by cosine distance.
# kneighbors() is called WITHOUT a query matrix on purpose: in that mode
# scikit-learn returns the neighbours of each fitted point and never counts
# the point as its own neighbour. The previous approach (ask for 6, drop
# position 0) assumed the self-match always sorts first, which does not hold
# when several dialogues are identical — a row could then be paired with
# itself while a genuine neighbour was discarded.
nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='cosine', algorithm='brute').fit(tfidf_matrix)
distances, indices = nbrs.kneighbors()

# Pull the columns out once; plain list indexing is much cheaper than a
# DataFrame .loc lookup for every candidate pair.
dialogues = dialogue_df['dialogue'].tolist()
characters = dialogue_df['character'].tolist()
series_labels = dialogue_df['series'].tolist()

result_columns = [
    'Dialogue_1', 'Character_1', 'Series_1',
    'Dialogue_2', 'Character_2', 'Series_2',
    'Cosine_Similarity',
]

# Store results. Pairs are directional: (i, j) and (j, i) can both appear when
# each row is among the other's nearest neighbours.
results = []
for i, (dists, idxs) in enumerate(zip(distances, indices)):
    for dist, j in zip(dists, idxs):
        similarity = 1 - dist  # cosine distance -> cosine similarity
        if similarity >= MIN_SIMILARITY:
            results.append({
                'Dialogue_1': dialogues[i],
                'Character_1': characters[i],
                'Series_1': series_labels[i],
                'Dialogue_2': dialogues[j],
                'Character_2': characters[j],
                'Series_2': series_labels[j],
                'Cosine_Similarity': similarity
            })

# Explicit columns keep the header stable even when no pair clears the threshold
similarity_top_df = pd.DataFrame(results, columns=result_columns)


#### Writing the dataset
######  - [_Click here to move back to index_](#Similarity-Indices)

In [58]:
from Data_Preprocessing import write_data

# Persist both similarity tables through the project's write_data helper.
# NOTE(review): the printed paths below suggest write_data resolves bare file
# names under data/processed — confirm in Data_Preprocessing.
write_data(similarity_df, 'Overall_Similarity.csv')
# Left as the cell's final expression, so its return value is echoed as output.
write_data(similarity_top_df, 'Pairwise_Similarity.csv')

Data written to: C:\Users\utkar\Desktop\PyCharm Projects Spring\Natural Language Processing\data\processed\Overall_Similarity.csv
Data written to: C:\Users\utkar\Desktop\PyCharm Projects Spring\Natural Language Processing\data\processed\Pairwise_Similarity.csv


'data\\processed\\Pairwise_Similarity.csv'