In [2]:
import pandas as pd 
import numpy as np
import re

In [3]:
'''
The purpose of the code in this cell is to tranform the txt files we have into workable csv files##
'''

#Open raw text files containing tables for respective movie information
open_titles_data = open("movie_titles_metadata.txt", mode = "r")
open_characters_data = open("movie_characters_metadata.txt", mode = 'r')
open_lines_data = open("movie_lines.txt", mode = 'r')
open_conversations_data = open("movie_conversations.txt", mode = 'r')


#Create a csv file for each text to work on
titles_csv = open("titles_workable.csv","w+")
characters_csv = open("characters_workable.csv", "w+")
lines_csv = open("lines_workable.csv", "w+")
conversations_csv = open("conversations_workable.csv", "w+")



###Write csv file from raw text files###

#Titles#
for line in open_titles_data:
    #replace commas in any list elements, interferes with parser
    line = line.replace(",","-")
    #replace default separater with comma to read as csv
    titles_csv.write(line.replace(" +++$+++ ",","))
titles_csv.close()


#Characters#
for line in open_characters_data:
    #replace commas in any list elements, interferes with parser
    line = line.replace(",","-")
    #replace default separater with comma to read as csv
    characters_csv.write(line.replace(" +++$+++ ",","))
characters_csv.close()


#Lines#
for line in open_lines_data:
    #replace commas in any list elements, interferes with parser
    line = line.replace(",","-")
    #replace default separater with comma to read as csv
    lines_csv.write(line.replace(" +++$+++ ",","))
lines_csv.close()


#Conversations#
for line in open_conversations_data:
    #replace commas in any list elements, interferes with parser
    line = line.replace(",","-")
    #replace default separater with comma to read as csv
    conversations_csv.write(line.replace(" +++$+++ ",","))
conversations_csv.close()

movie_titles = pd.read_csv("titles_workable.csv", encoding='latin-1', header = None)
movie_characters = pd.read_csv("characters_workable.csv", encoding='latin-1', header = None)
movie_lines = pd.read_csv("lines_workable.csv", encoding='latin-1', header = None)
movie_conversations = pd.read_csv("conversations_workable.csv", encoding='latin-1', header = None)

In [4]:
'''
The purpose of the code in this cell is to create workable dataframes from the csv files we created
'''

#Set column names for each table we have to work with##
movie_titles.columns = ['movieID', 'title', 'year', 'rating', '# of votes', 'genres']
movie_characters.columns = ['characterID', 'name', 'movieID', 'title', 'gender', 'pos in credits']
movie_lines.columns = ['lineID', 'characterID', 'movieID', 'name', 'text']
movie_conversations.columns = ['characterID_first' , 'characterID_second' , 'movieID', 'line_order']


for i in range(len(movie_titles['genres'])):
    #put the original list of genres back together, currently stored as a string representation of a list
    movie_titles['genres'][i] = movie_titles['genres'][i].strip('][').split('- ') 
    
for i in range(len(movie_conversations['line_order'])):
    #put the original list of lines that make a conversation back together, 
    #currently stored as a string representation of a list
    movie_conversations['line_order'][i] = movie_conversations['line_order'][i].strip('][').split('- ')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
movie_titles.head()

Unnamed: 0,movieID,title,year,rating,# of votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
2,m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
3,m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
4,m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


In [6]:
movie_characters.head()

Unnamed: 0,characterID,name,movieID,title,gender,pos in credits
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [7]:
movie_lines.head()

Unnamed: 0,lineID,characterID,movieID,name,text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [8]:
movie_conversations.head()

Unnamed: 0,characterID_first,characterID_second,movieID,line_order
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [11]:
'''
The purpose of this cell is to construct a dataframe that contains for each year, a count 
of the number of lines in a comedy script by a male actor vs. a female actor
'''
comedy_movies = movie_titles.copy()
comedy_movies["isComedy"] = False
for i in range(len(movie_titles['genres'])):
    comedy_movies["isComedy"][i] = "'comedy'" in comedy_movies['genres'][i]
comedy_movies = comedy_movies[comedy_movies['isComedy'] == True]
comedy_movie_lines = comedy_movies.merge(movie_lines, how = 'inner', on = 'movieID')


#Retrieves relevant info from the lines of comedy movies dataframe
comedy_movie_lines_trim = comedy_movie_lines[['movieID', 'title', 'year', 'lineID', 'characterID', 'name', 'text']]

#Adds in info  about the characters saying those lines
comedy_lines_characterinfo = comedy_movie_lines_trim.merge(movie_characters)

#Compiles a list of years in the dataset
years_recorded = []
for year in comedy_lines_characterinfo['year']:
    if year not in years_recorded:
        years_recorded.append(year)

#Changes list of years to ints and sorts
for i in range(len(years_recorded)):
    years_recorded[i] = int(years_recorded[i][:4])
years_recorded.sort()

#Initializes what will be a nested dictionary. For each year, a count of lines for comedy movies in that year
years_recorded_mflines = {}

#Creates inner dictionary.
for year in years_recorded:
    years_recorded_mflines[str(year)] = {'m': 0, 'f': 0}
    

#For each line in each comedy movie, adds 1 to the count of the respective movie/gender combo
for i in range(len(comedy_lines_characterinfo)):
    if comedy_lines_characterinfo['year'][i] not in years_recorded_mflines:
        pass
    elif comedy_lines_characterinfo['gender'][i] == 'm':
        years_recorded_mflines[ comedy_lines_characterinfo['year'][i] ]['m'] += 1
    elif comedy_lines_characterinfo['gender'][i] == 'f':
        years_recorded_mflines[ comedy_lines_characterinfo['year'][i] ]['f'] += 1

#Creates new dataframe that will be years and number of lines for males and females in that year in comedy movies
comedy_line_breakdown = pd.DataFrame()
comedy_line_breakdown['Year'] = years_recorded_mflines
comedy_line_breakdown['Male Lines'] = 0
comedy_line_breakdown['Female Lines'] = 0

for i in range(len(comedy_line_breakdown)):
    comedy_line_breakdown['Male Lines'][i] = years_recorded_mflines[ comedy_line_breakdown['Year'][i] ]['m']
    comedy_line_breakdown['Female Lines'][i] = years_recorded_mflines[ comedy_line_breakdown['Year'][i] ]['f']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
comedy_line_breakdown.to_csv(r'C:\Users\peter\Desktop\2950-Project\Comedy_Lines_Year_vs_Gender.csv', index = False, header=True)
comedy_line_breakdown.head()

Unnamed: 0,Year,Male Lines,Female Lines
1931,1931,368,77
1932,1932,218,193
1933,1933,155,0
1934,1934,1110,902
1936,1936,498,147
