In [2]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('newfriends_quotes.csv')

# Initialize the Scene column with NaN values
df['Scene'] = pd.NA

# Iterate through the DataFrame to update the Scene number whenever 'Scene:' is found in the quote
current_scene = 0
for index, row in df.iterrows():
    if 'Scene:' in row['quote']:
        current_scene += 1
    df.at[index, 'Scene'] = current_scene

# Save the updated DataFrame back to the CSV file
df.to_csv('newfriends_quotes_updated.csv', index=False)

print('Scene column added and updated based on the quotes containing "Scene:".')
print(df.head())


Scene column added and updated based on the quotes containing "Scene:".
     author  episode_number           episode_title  \
0    Monica             1.0  Monica Gets A Roommate   
1      Joey             1.0  Monica Gets A Roommate   
2  Chandler             1.0  Monica Gets A Roommate   
3    Phoebe             1.0  Monica Gets A Roommate   
4    Phoebe             1.0  Monica Gets A Roommate   

                                               quote  quote_order  season  \
0  There's nothing to tell! He's just some guy I ...          0.0     1.0   
1  C'mon, you're going out with the guy! There's ...          1.0     1.0   
2  All right Joey, be nice. So does he have a hum...          2.0     1.0   
3                           Wait, does he eat chalk?          3.0     1.0   
4  Just, 'cause, I don't want her to go through w...          4.0     1.0   

  Scene  
0     0  
1     0  
2     0  
3     0  
4     0  


In [9]:
# Read the CSV file
df = pd.read_csv('newfriends_quotes_updated.csv')

# Convert to JSON
json_data = df.to_json(orient='records')

# Write to a JSON file
with open('friends_quotes.json', 'w') as json_file:
    json_file.write(json_data)

In [2]:
import csv
import json

# Define the input and output file paths
input_csv_file = 'newfriends_quotes_updated.csv'
output_json_file = 'data.json'

# Read the CSV file and convert it to a list of dictionaries
data = []
with open(input_csv_file, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append(row)

# Write the list of dictionaries to a JSON file
with open(output_json_file, 'w') as jsonfile:
    json.dump(data, jsonfile, indent=4)


In [10]:
import pandas as pd
import json
# Read the CSV file
df = pd.read_csv('newfriends_quotes_updated.csv')

# Create an empty dictionary for storing the JSON structure
json_data = {"seasons": []}

# Group the data by season and episode
grouped = df.groupby(['season', 'episode_number', 'episode_title'])

for (season, episode_number, episode_title), group in grouped:
    # Create a dictionary for the season if it doesn't exist
    season_dict = {
        "number": season,
        "episodes": []
    }
    
    # Create a dictionary for the episode
    episode_dict = {
        "number": episode_number,
        "title": episode_title,
        "scenes": []
    }
    
    # Group the episode data by scene
    scene_grouped = group.groupby('Scene')
    
    for scene, scene_group in scene_grouped:
        # Create a dictionary for the scene
        scene_dict = {
            "number": scene,
            "quotes": []
        }
        
        # Iterate through quotes in the scene
        for _, row in scene_group.iterrows():
            quote = {
                "author": row['author'],
                "quote": row['quote']
            }
            scene_dict["quotes"].append(quote)
        
        # Append the scene dictionary to the episode's scenes
        episode_dict["scenes"].append(scene_dict)
    
    # Append the episode dictionary to the season's episodes
    season_dict["episodes"].append(episode_dict)
    
    # Append the season dictionary to the main JSON data
    json_data["seasons"].append(season_dict)

# Write JSON data to a file
with open('friends_quotes.json', 'w') as json_file:
    json.dump(json_data, json_file, indent=2)


In [9]:
import csv
import re
from nltk.tokenize import word_tokenize

# Define the input and output file paths
input_csv_file = 'newfriends_quotes_updated.csv'
output_csv_file = 'data_modified.csv'

# Function to remove punctuation marks
def remove_punctuation(text):
    return re.sub(r'[!.,?]', '', text)

# Read the CSV file and process the quotes
with open(input_csv_file, 'r') as csvfile, open(output_csv_file, 'w', newline='') as output_csv:
    reader = csv.DictReader(csvfile)
    fieldnames = reader.fieldnames + ['modified_quote']
    writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
    writer.writeheader()
    
    for row in reader:
        quote = row['quote']
        quote = remove_punctuation(quote)  # Remove punctuation marks
        words = word_tokenize(quote)  # Tokenize the quote into words
        modified_quote = ', '.join(words)  # Separate words by commas
        
        # Update the row with modified quote
        row['modified_quote'] = modified_quote
        
        # Write the modified row to the output CSV file
        writer.writerow(row)

print("Quotes have been modified and saved to 'data_modified.csv'.")


Quotes have been modified and saved to 'data_modified.csv'.


In [11]:
import csv
import json

# Define the input and output file paths
input_csv_file = 'data_modified.csv'
output_json_file = 'data_modified.json'

# Read the modified CSV file and convert it to a list of dictionaries
data_modified = []
with open(input_csv_file, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data_modified.append(row)

# Write the list of dictionaries to a JSON file
with open(output_json_file, 'w') as jsonfile:
    json.dump(data_modified, jsonfile, indent=4)

print("Modified data has been converted to JSON and saved to 'data_modified.json'.")


Modified data has been converted to JSON and saved to 'data_modified.json'.


In [11]:
import pandas as pd
import json

# Read the CSV file
df = pd.read_csv("newfriends_quotes_updated.csv", delimiter="\t")

# Convert to JSON format
data = {"seasons": []}

# Group by season, episode, and scene
grouped = df.groupby(["season", "episode_number", "episode_title", "Scene"])

for (season, episode_number, episode_title, scene), group in grouped:
    scene_data = {
        "number": scene,
        "location": "",  # Add location if available
        "lines": []
    }

    for _, row in group.iterrows():
        line = {
            "character": row["author"],
            "spokenLine": row["quote"],
            "words": row["quote"].lower().split()
        }
        scene_data["lines"].append(line)

    episode_data = {
        "number": episode_number,
        "name": episode_title,
        "scenes": [scene_data]
    }

    # Check if the season already exists
    existing_season = next((s for s in data["seasons"] if s["number"] == season), None)

    if existing_season:
        # Check if the episode already exists
        existing_episode = next((e for e in existing_season["episodes"] if e["number"] == episode_number), None)

        if existing_episode:
            # Add scene to existing episode
            existing_episode["scenes"].append(scene_data)
        else:
            # Add new episode to existing season
            existing_season["episodes"].append(episode_data)
    else:
        # Add new season with episode and scene
        data["seasons"].append({
            "number": season,
            "episodes": [episode_data]
        })

# Write data to JSON file
with open("output.json", "w") as outfile:
    json.dump(data, outfile, indent=4)


KeyError: 'season'

### Couting frquency for word cloud

In [2]:
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords data (if not already downloaded)
nltk.download('stopwords')

# Load the JSON data
with open('data_modified.json', 'r') as file:
    data = json.load(file)

# Extract words from each modified quote, excluding stopwords
english_stopwords = set(stopwords.words('english'))
words = []
for item in data:
    words.extend([word.lower() for word in item['modified_quote'].split(', ') if word.lower() not in english_stopwords])

# Count the frequency of each word
word_freq = Counter(words)

# Convert word frequency to JSON format
wordcloud_data = [{'text': word, 'size': freq} for word, freq in word_freq.items()]

# Save the word cloud data to a JSON file
with open('wordcloud_data.json', 'w') as file:
    json.dump(wordcloud_data, file, indent=4)

print("Word cloud data has been saved to 'wordcloud_data.json'.")


[nltk_data] Downloading package stopwords to /Users/nachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word cloud data has been saved to 'wordcloud_data.json'.


In [3]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('data_modified.csv')

# Convert DataFrame to JSON
json_data = df.to_dict(orient='records')

# Sort JSON data by season and episode number
sorted_json_data = sorted(json_data, key=lambda x: (float(x['season']), float(x['episode_number'])))

# Group JSON data by season
seasons_dict = {}
for item in sorted_json_data:
    season = item['season']
    if season not in seasons_dict:
        seasons_dict[season] = []
    seasons_dict[season].append(item)

# Convert dictionary to JSON
json_seasons_data = {season: episodes for season, episodes in seasons_dict.items()}

# Print JSON data grouped by season
with open('Friends_season.json', 'w') as json_file:
    json.dump(json_seasons_data, json_file, indent=4)

print("JSON data saved to 'output.json'")


JSON data saved to 'output.json'


In [1]:
import pandas as pd
import json

# Load the data from CSV
df = pd.read_csv('newfriends_quotes_updated.csv')

# Grouping data by season, then by episode, then by scene, and finally organizing quotes
seasons = df.groupby('season')
season_dict = {}
for season, season_df in seasons:
    episode_dict = {}
    episodes = season_df.groupby('episode_number')
    for episode, episode_df in episodes:
        scene_dict = {}
        scenes = episode_df.groupby('Scene')
        for scene, scene_df in scenes:
            quotes_list = []
            for _, row in scene_df.iterrows():
                quote_dict = {'author': row['author'], 'quote': row['quote']}
                quotes_list.append(quote_dict)
            scene_dict[scene] = quotes_list
        episode_dict[episode] = {'title': episode_df['episode_title'].iloc[0], 'scenes': scene_dict}
    season_dict[season] = episode_dict

# Convert dictionary to JSON
json_data = json.dumps(season_dict, indent=4)

# Optionally, save the JSON to a file
with open('quotes.json', 'w') as json_file:
    json_file.write(json_data)

print('JSON conversion and file saving completed.')

JSON conversion and file saving completed.


In [1]:
import json

# Load the JSON data
with open('quotes.json', 'r') as file:
    data = json.load(file)

# Print the structure of the first few entries to understand the format
for key in list(data.keys())[:5]:
    print(f"Key: {key}, Value: {data[key]}")

Key: 1.0, Value: {'1.0': {'title': 'Monica Gets A Roommate', 'scenes': {'0': [{'author': 'Monica', 'quote': "There's nothing to tell! He's just some guy I work with!"}, {'author': 'Joey', 'quote': "C'mon, you're going out with the guy! There's gotta be something wrong with him!"}, {'author': 'Chandler', 'quote': 'All right Joey, be nice. So does he have a hump? A hump and a hairpiece?'}, {'author': 'Phoebe', 'quote': 'Wait, does he eat chalk?'}, {'author': 'Phoebe', 'quote': "Just, 'cause, I don't want her to go through what I went through with Carl- oh!"}, {'author': 'Monica', 'quote': "Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex."}, {'author': 'Chandler', 'quote': 'Sounds like a date to me.'}, {'author': 'Chandler', 'quote': "Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked."}, {'author': 'All', 'quote': 'Oh, yeah. Had that dream.'}, {'author': 'Chandler'

In [2]:
import json

# Load the JSON data
with open('quotes.json', 'r') as file:
    data = json.load(file)

# Print the structure of the first few entries to understand the format
for key in list(data.keys())[:5]:
    print(f"Key: {key}, Value: {data[key]}")

# Save the printed output in a JSON file
output_data = {
    "structure": [
        {"Key": key, "Value": data[key]} for key in list(data.keys())[:5]
    ]
}

with open('output.json', 'w') as outfile:
    json.dump(output_data, outfile, indent=4)

print("Saved the structure of the first few entries in output.json")


Key: 1.0, Value: {'1.0': {'title': 'Monica Gets A Roommate', 'scenes': {'0': [{'author': 'Monica', 'quote': "There's nothing to tell! He's just some guy I work with!"}, {'author': 'Joey', 'quote': "C'mon, you're going out with the guy! There's gotta be something wrong with him!"}, {'author': 'Chandler', 'quote': 'All right Joey, be nice. So does he have a hump? A hump and a hairpiece?'}, {'author': 'Phoebe', 'quote': 'Wait, does he eat chalk?'}, {'author': 'Phoebe', 'quote': "Just, 'cause, I don't want her to go through what I went through with Carl- oh!"}, {'author': 'Monica', 'quote': "Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex."}, {'author': 'Chandler', 'quote': 'Sounds like a date to me.'}, {'author': 'Chandler', 'quote': "Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked."}, {'author': 'All', 'quote': 'Oh, yeah. Had that dream.'}, {'author': 'Chandler'

In [4]:
!pip3 install convokit


Collecting convokit
  Downloading convokit-3.0.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.2/183.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting dill>=0.2.9
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting clean-text>=0.6.0
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting dnspython>=1.16.0
  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting nltk>=3.4
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
from convokit import Corpus, download
corpus = Corpus(filename=download("friends-corpus"))

Downloading friends-corpus to /Users/saumickpradhan/.convokit/downloads/friends-corpus
Downloading friends-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip (6.1MB)... Done
No configuration file found at /Users/saumickpradhan/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


In [6]:
!pip install jsonl2json

Collecting jsonl2json
  Downloading jsonl2json-1.0.0-py3-none-any.whl (3.9 kB)
Installing collected packages: jsonl2json
Successfully installed jsonl2json-1.0.0


In [7]:
from jsonl2json import JsonlToJsonFormatter

jsonl = JsonlToJsonFormatter('utterances.jsonl', 'Realutterances.json')
jsonl.to_json()

In [1]:
import pandas as pd

# Read CSV file into DataFrame
df = pd.read_csv('Realemotions_friends.csv')

# Convert DataFrame to JSON
json_data = df.to_json(orient='records')

# Write JSON to file
with open('JsonRealemotions_friends.json', 'w') as f:
    f.write(json_data)

print("Conversion complete. JSON file saved as 'data.json'.")

Conversion complete. JSON file saved as 'data.json'.
