# 09 Summarize Track Records


## 09.01 Imports


### 09.01.01 Python Imports


In [80]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from statistics import mean

### 09.01.02 Feature Film Import


In [81]:
Disney_df = pd.read_csv('../Bens_Data/Disney_imdb_scrape_final.csv')
Disney_df.drop(columns=['Unnamed: 0'], inplace=True)

### 09.01.03 Directors and Writers History Import


In [82]:
directors_df = pd.read_csv('../Bens_Data/directors_demo_history.csv')
directors_df.drop(columns=['Unnamed: 0'], inplace=True)
writers_df = pd.read_csv('../Bens_Data/writers_demo_history.csv')
writers_df.drop(columns=['Unnamed: 0'], inplace=True)

## 09.02 Cleanup Release Date for Featured Films


In [83]:
Disney_df['rlsdt'] = Disney_df['rlsdt'].str.replace(r"\(.*\)","")
Disney_df['rlsdt']= pd.to_datetime(Disney_df['rlsdt'],errors='coerce')
Disney_df.head()

  Disney_df['rlsdt'] = Disney_df['rlsdt'].str.replace(r"\(.*\)","")


Unnamed: 0,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,newurl,rating,votes,year,mpaarating,runtime,rlsdt,budget,worldwide
0,101 Dalmatians,tt0115433,movie,101 Dalmatians,101 Dalmatians,1996,103,"Adventure,Comedy,Crime",101 dalmatians,5.7,109712.0,https://www.imdb.com/title/tt0115433,5.7,110K,1996,G,1h 43m,1996-11-27,"$75,000,000 (estimated)","$320,689,294"
1,102 Dalmatians,tt0211181,movie,102 Dalmatians,102 Dalmatians,2000,100,"Adventure,Comedy,Family",102 dalmatians,4.8,37056.0,https://www.imdb.com/title/tt0211181,4.8,37K,2000,G,1h 40m,2000-11-22,"$85,000,000 (estimated)","$183,611,771"
2,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea","20,000 Leagues Under the Sea",1954,127,"Adventure,Drama,Family",20000 leagues under the sea,7.2,33109.0,https://www.imdb.com/title/tt0046672,7.2,33K,1954,G,2h 7m,1955-07-20,"$9,000,000 (estimated)",
3,A Bug's Life,tt0120623,movie,A Bug's Life,A Bug's Life,1998,95,"Adventure,Animation,Comedy",a bugs life,7.2,284538.0,https://www.imdb.com/title/tt0120623,7.2,285K,1998,G,1h 35m,1998-11-25,"$120,000,000 (estimated)","$363,258,859"
4,A Christmas Carol,tt1067106,movie,A Christmas Carol,A Christmas Carol,2009,96,"Adventure,Animation,Comedy",a christmas carol,6.8,112582.0,https://www.imdb.com/title/tt1067106,6.8,113K,2009,PG,1h 36m,2009-11-06,"$200,000,000 (estimated)","$325,286,646"


## 09.03 Process to Find Director's Track Record

For each Feature Film, we want to include the Director's Age at the time of release, a count of their previous feature films, as well as the ROI, IMDB Rating, and Run Time of their previous work.  For our Feature Films that may have more than one director, we want to use some sort of aggregate information.  Average Age, Average Runtime, Count of Directors, etc.  This section of code is us breaking down this process in to baby steps.  

In [84]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   DFL_title       439 non-null    object        
 1   tconst          439 non-null    object        
 2   titleType       439 non-null    object        
 3   primaryTitle    439 non-null    object        
 4   originalTitle   439 non-null    object        
 5   startYear       439 non-null    int64         
 6   runtimeMinutes  439 non-null    int64         
 7   genres          439 non-null    object        
 8   lower_title     439 non-null    object        
 9   averageRating   439 non-null    float64       
 10  numVotes        439 non-null    float64       
 11  newurl          439 non-null    object        
 12  rating          439 non-null    float64       
 13  votes           439 non-null    object        
 14  year            439 non-null    int64         
 15  mpaara

In [85]:
directors_df.head()

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
0,nm0000110,director,Kenneth Branagh,1960.0,tt0113403,director,A Midwinter's Tale,99.0,Comedy,R,...,2.0,16.0,Friday,0.0,469571.0,inf,7.2,2577.0,both,35.0
1,nm0000110,director,Kenneth Branagh,1960.0,tt0450972,director,As You Like It,127.0,"Comedy,Drama,Romance",PG,...,9.0,21.0,Friday,0.0,563162.0,inf,6.1,3354.0,both,46.0
2,nm0000110,director,Kenneth Branagh,1960.0,tt0475331,director,The Magic Flute,135.0,"Drama,Musical,Romance",error,...,12.0,13.0,Wednesday,27000000.0,2000853.0,-0.925894,6.5,1236.0,both,46.0
3,nm0000110,director,Kenneth Branagh,1960.0,tt0800369,director,Thor,115.0,"Action,Adventure,Fantasy",PG-13,...,5.0,6.0,Friday,150000000.0,449326618.0,1.995511,7.0,810857.0,both,51.0
4,nm0000110,director,Kenneth Branagh,1960.0,tt12789558,director,Belfast,98.0,"Biography,Drama,History",PG-13,...,11.0,12.0,Friday,0.0,46922870.0,inf,7.3,44239.0,both,61.0


In [168]:
# CLean up release date
directors_df['rlsdt_dt'] = directors_df['rlsdt_dt'].str.replace(r"\(.*\)","")
directors_df['rlsdt_dt']= pd.to_datetime(directors_df['rlsdt_dt'],errors='coerce')
directors_df.head()

AttributeError: Can only use .str accessor with string values!

What if there is more than 1 director?

In [88]:
# This will be used as a test. This film has multiple directors
Disney_df[Disney_df['tconst'] == "tt1049413"]

directors_df[directors_df['tconst'] == "tt1049413"]['nconst'].count()

2

In [89]:
# Here is how we will get the Diresctors if there is more than one
dirs = directors_df[directors_df['tconst'] == "tt1049413"]['nconst']
for dir in dirs:
    print (dir)

nm0230032
nm0677037


What if the director has done more than one film?

In [90]:
# This will be used as a test. This director has done multiple films for Disney
directors_df[(directors_df['nconst']=="nm0064415") &  (directors_df['tconst'] == "tt0054372")]

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
714,nm0064415,director,William Beaudine,1892.0,tt0054372,director,Ten Who Dared,92.0,"Adventure,Family,Western",Not Rated,...,11.0,1.0,Tuesday,0.0,0.0,,5.5,254.0,both,68.0


In [91]:
Disney_df[Disney_df['tconst'] == "tt0054372"]

Unnamed: 0,DFL_title,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,lower_title,averageRating,numVotes,newurl,rating,votes,year,mpaarating,runtime,rlsdt,budget,worldwide
277,Ten Who Dared,tt0054372,movie,Ten Who Dared,Ten Who Dared,1960,92,"Adventure,Family,Western",ten who dared,5.5,254.0,https://www.imdb.com/title/tt0054372,5.5,253,1960,Not Rated,1h 32m,1960-11-01,,


In [92]:
Disney_df['tconst'][277]    # Just confirming what's above

'tt0054372'

In [93]:
# This can be used to get the actual age, or the average age if more than one director
directors_df[directors_df['tconst'] == Disney_df['tconst'][277]]['age'].mean()

68.0

In [94]:
# Getting the nconst for the director for a single film
dnconst = directors_df[directors_df['tconst'] == Disney_df['tconst'][277]]['nconst']
dnconst.reset_index(inplace=True, drop=True)
rlsdt = directors_df[directors_df['tconst'] == Disney_df['tconst'][277]]['rlsdt_dt']
rlsdt.reset_index(inplace=True, drop=True)
dnconst[0], rlsdt[0]

('nm0064415', Timestamp('1960-11-01 00:00:00'))

In [95]:
# Getting all films done by that director
directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
578,nm0064415,director,William Beaudine,1892.0,tt0014353,director,Penrod and Sam,84.0,"Comedy,Drama",error,...,6.0,18.0,Monday,0.0,0.0,,7.0,58.0,both,31.0
579,nm0064415,director,William Beaudine,1892.0,tt0016028,director,Little Annie Rooney,94.0,"Comedy,Drama",Unrated,...,10.0,18.0,Sunday,0.0,0.0,,6.8,969.0,both,33.0
580,nm0064415,director,William Beaudine,1892.0,tt0016706,director,The Canadian,80.0,Romance,Passed,...,11.0,27.0,Saturday,0.0,0.0,,7.6,36.0,both,34.0
581,nm0064415,director,William Beaudine,1892.0,tt0017423,director,Sparrows,109.0,Drama,Unrated,...,9.0,6.0,Monday,463455.0,0.0,-1.0,7.4,1321.0,both,34.0
582,nm0064415,director,William Beaudine,1892.0,tt0018987,director,Heart to Heart,70.0,Comedy,error,...,7.0,22.0,Sunday,0.0,0.0,,5.4,26.0,both,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719,nm0064415,director,William Beaudine,1892.0,tt0130060,director,Killer at Large,61.0,"Crime,Drama,Romance",Approved,...,5.0,31.0,Saturday,0.0,0.0,,7.2,14.0,both,55.0
720,nm0064415,director,William Beaudine,1892.0,tt0148436,director,The Mad Parade,63.0,"Drama,War",Passed,...,9.0,18.0,Friday,0.0,0.0,,7.0,57.0,both,39.0
723,nm0064415,director,William Beaudine,1892.0,tt0207407,director,Detective Kitty O'Day,61.0,"Comedy,Mystery,Romance",Approved,...,5.0,13.0,Saturday,0.0,0.0,,5.4,257.0,both,52.0
724,nm0064415,director,William Beaudine,1892.0,tt0294274,director,Again... Pioneers,72.0,"Drama,Family",error,...,11.0,2.0,Thursday,0.0,0.0,,6.7,31.0,both,58.0


In [96]:
# Getting a count of all films done by that director (prior to release)
directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()

142

In [97]:
# Getting ROI all films done by that director (prior to release)
directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()

-1.0

In [98]:
# Getting IMDB Rating all films done by that director (prior to release)
directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()

5.983098591549295

In [99]:
# Getting IMDB Rating all films done by that director (prior to release)
directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()

69.52112676056338

In [100]:
directors_df[directors_df['tconst'] == "tt0115433"]

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
1453,nm0378893,director,Stephen Herek,1958.0,tt0115433,director,101 Dalmatians,103.0,"Adventure,Comedy,Crime",G,...,11.0,27.0,Wednesday,75000000.0,320689294.0,3.275857,5.7,109712.0,both,38.0


## 09.04 Directors' Track Record


Now that we have worked out each of the individual steps we need to take and how to account for multiples, we put it all together in a single function and run it for each of our feature films.

In [101]:

Disney_df["DIR_COUNT"] = np.nan 
Disney_df["DIR_AGE"] = np.nan 
Disney_df["DIR_ROI"] = np.nan 
Disney_df["DIR_RTG"] = np.nan 
Disney_df["DIR_RNTM"] = np.nan 
Disney_df["DIR_FILM_COUNT"] = np.nan

# Let's try putting it all together
# INDX = 277

def get_dir_hist(tcon):
    
    
    DIR_COUNT = directors_df[directors_df['tconst'] == tcon]['nconst'].count()
    # print (f'# Dirs: {DIR_COUNT}')
    rlsdt = directors_df[directors_df['tconst'] == tcon]['rlsdt_dt']
    rlsdt.reset_index(inplace=True, drop=True)
    
    if DIR_COUNT == 0:
        return 
    
    elif DIR_COUNT == 1:
    
        dnconst = directors_df[directors_df['tconst'] == tcon]['nconst']
        dnconst.reset_index(inplace=True, drop=True)
        

        DIR_AGE = directors_df[directors_df['tconst'] == tcon]['age'].mean()
        
        DIR_ROI = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
        DIR_RTG = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
        DIR_RNTM = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
        DIR_FILM_COUNT = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()
        
        
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_COUNT"] = DIR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_AGE"] = DIR_AGE
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_ROI"] = DIR_ROI
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RTG"] = DIR_RTG
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RNTM"] = DIR_RNTM
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_FILM_COUNT"] = DIR_FILM_COUNT
        return 
    
    else:
        # print('more than one')
        dirs = directors_df[directors_df['tconst'] == tcon]['nconst']
        
        for dir in dirs:
            # print (dir)
        
            dnconst = directors_df[directors_df['tconst'] == tcon]['nconst']
            dnconst.reset_index(inplace=True, drop=True)

            DIR_AGE_list = []
            DIR_ROI_list = []
            DIR_RTG_list = []
            DIR_RNTM_list = []
            DIR_FILM_COUNT_list = []

            DIR_AGE = directors_df[directors_df['tconst'] == tcon]['age'].mean()
            DIR_ROI = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
            DIR_RTG = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
            DIR_RNTM = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
            DIR_FILM_COUNT = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()

            DIR_AGE_list.append(DIR_AGE)
            DIR_ROI_list.append(DIR_ROI)
            DIR_RTG_list.append(DIR_RTG)
            DIR_RNTM_list.append(DIR_RNTM)
            DIR_FILM_COUNT_list.append(DIR_FILM_COUNT)
            
            
        
        
            
            
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_COUNT"] = DIR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_AGE"] = mean(DIR_AGE_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_ROI"] = mean(DIR_ROI_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RTG"] = mean(DIR_RTG_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RNTM"] = mean(DIR_RNTM_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_FILM_COUNT"] = mean(DIR_FILM_COUNT_list)

In [102]:
for tconst in Disney_df['tconst']:
    # print(f'Film: {tconst}')
    get_dir_hist(tconst)
    
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   DFL_title       439 non-null    object        
 1   tconst          439 non-null    object        
 2   titleType       439 non-null    object        
 3   primaryTitle    439 non-null    object        
 4   originalTitle   439 non-null    object        
 5   startYear       439 non-null    int64         
 6   runtimeMinutes  439 non-null    int64         
 7   genres          439 non-null    object        
 8   lower_title     439 non-null    object        
 9   averageRating   439 non-null    float64       
 10  numVotes        439 non-null    float64       
 11  newurl          439 non-null    object        
 12  rating          439 non-null    float64       
 13  votes           439 non-null    object        
 14  year            439 non-null    int64         
 15  mpaara

## 09.05 Process to Find Writer's Track Record

Just like we did for the Directora, for each Feature Film we want to include the Writer's Age at the time of release, a count of their previous feature films, as well as the ROI, IMDB Rating, and Run Time of their previous work.  For our Feature Films that may have more than one writer, we want to use some sort of aggregate information.  Average Age, Average Runtime, Count of Writers, etc.  This section of code is us breaking down this process in to baby steps.  

In [104]:
writers_df.head()

Unnamed: 0,nconst,writer,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
0,nm0000184,writer,George Lucas,1944.0,tt0079576,writer,More American Graffiti,110.0,"Comedy,Drama,War",PG,...,8.0,3.0,Friday,3000000.0,15014674.0,4.004891,5.3,4460.0,both,35.0
1,nm0000184,writer,George Lucas,1944.0,tt0080684,writer,Star Wars: Episode V - The Empire Strikes Back,124.0,"Action,Adventure,Fantasy",PG,...,6.0,20.0,Friday,18000000.0,538375067.0,28.909726,8.7,1240496.0,both,36.0
2,nm0000184,writer,George Lucas,1944.0,tt0082971,writer,Indiana Jones and the Raiders of the Lost Ark,115.0,"Action,Adventure",PG,...,6.0,12.0,Friday,18000000.0,389925971.0,20.662554,8.4,938031.0,both,37.0
3,nm0000184,writer,George Lucas,1944.0,tt0086190,writer,Star Wars: Episode VI - Return of the Jedi,131.0,"Action,Adventure,Fantasy",PG,...,5.0,25.0,Wednesday,32500000.0,475106177.0,13.618652,8.3,1013891.0,both,39.0
4,nm0000184,writer,George Lucas,1944.0,tt0087469,writer,Indiana Jones and the Temple of Doom,118.0,"Action,Adventure",PG,...,5.0,23.0,Wednesday,28000000.0,333107271.0,10.896688,7.5,475269.0,both,40.0


In [105]:
# Clean up the release date format
writers_df['rlsdt_dt'] = writers_df['rlsdt_dt'].str.replace(r"\(.*\)","")
writers_df['rlsdt_dt']= pd.to_datetime(writers_df['rlsdt_dt'],errors='coerce')
writers_df.head()

  writers_df['rlsdt_dt'] = writers_df['rlsdt_dt'].str.replace(r"\(.*\)","")


Unnamed: 0,nconst,writer,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
0,nm0000184,writer,George Lucas,1944.0,tt0079576,writer,More American Graffiti,110.0,"Comedy,Drama,War",PG,...,8.0,3.0,Friday,3000000.0,15014674.0,4.004891,5.3,4460.0,both,35.0
1,nm0000184,writer,George Lucas,1944.0,tt0080684,writer,Star Wars: Episode V - The Empire Strikes Back,124.0,"Action,Adventure,Fantasy",PG,...,6.0,20.0,Friday,18000000.0,538375067.0,28.909726,8.7,1240496.0,both,36.0
2,nm0000184,writer,George Lucas,1944.0,tt0082971,writer,Indiana Jones and the Raiders of the Lost Ark,115.0,"Action,Adventure",PG,...,6.0,12.0,Friday,18000000.0,389925971.0,20.662554,8.4,938031.0,both,37.0
3,nm0000184,writer,George Lucas,1944.0,tt0086190,writer,Star Wars: Episode VI - Return of the Jedi,131.0,"Action,Adventure,Fantasy",PG,...,5.0,25.0,Wednesday,32500000.0,475106177.0,13.618652,8.3,1013891.0,both,39.0
4,nm0000184,writer,George Lucas,1944.0,tt0087469,writer,Indiana Jones and the Temple of Doom,118.0,"Action,Adventure",PG,...,5.0,23.0,Wednesday,28000000.0,333107271.0,10.896688,7.5,475269.0,both,40.0


In [106]:
writers_df['tconst'].value_counts()

tt5095030    5
tt1104001    5
tt0120917    5
tt0304669    5
tt0400497    5
            ..
tt0014468    1
tt0016039    1
tt0017877    1
tt0020007    1
tt4587656    1
Name: tconst, Length: 4347, dtype: int64

In [107]:
writers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5610 entries, 0 to 5609
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   nconst          5610 non-null   object        
 1   writer          5610 non-null   object        
 2   primaryName     5610 non-null   object        
 3   birthYear       4598 non-null   float64       
 4   tconst          5610 non-null   object        
 5   category        5610 non-null   object        
 6   primaryTitle    5610 non-null   object        
 7   runtimeMinutes  5463 non-null   float64       
 8   genres          5610 non-null   object        
 9   mpaarating      5610 non-null   object        
 10  rlsdt_dt        5511 non-null   datetime64[ns]
 11  startYear       5610 non-null   float64       
 12  rlsdt_mo        5511 non-null   float64       
 13  rlsdt_day       5511 non-null   float64       
 14  rlsdt_daynm     5511 non-null   object        
 15  budg

What if there is more than 1 writer?

In [108]:
# This will be used as a test. This film has multiple writers
Disney_df[Disney_df['tconst'] == "tt5095030"]

writers_df[writers_df['tconst'] == "tt5095030"]['nconst'].count()

5

In [109]:
# Here is how we will get the Writers if there is more than one
wtrs = writers_df[writers_df['tconst'] == "tt5095030"]['nconst']
for wtr in wtrs:
    print (wtr)

nm0498278
nm0571344
nm1273099
nm2592245
nm3029372


We can use a function similar what was created for the Directors to get this information

In [110]:

Disney_df["WTR_COUNT"] = np.nan 
Disney_df["WTR_AGE"] = np.nan 
Disney_df["WTR_ROI"] = np.nan 
Disney_df["WTR_RTG"] = np.nan 
Disney_df["WTR_RNTM"] = np.nan 
Disney_df["WTR_FILM_COUNT"] = np.nan

# Let's try putting it all together
# INDX = 277

def get_wtr_hist(tcon):
    
    
    WTR_COUNT = writers_df[writers_df['tconst'] == tcon]['nconst'].count()
    # print (f'# Dirs: {WTR_COUNT}')
    rlsdt = writers_df[writers_df['tconst'] == tcon]['rlsdt_dt']
    rlsdt.reset_index(inplace=True, drop=True)
    
    if WTR_COUNT == 0:
        return 
    
    elif WTR_COUNT == 1:
    
        dnconst = writers_df[writers_df['tconst'] == tcon]['nconst']
        dnconst.reset_index(inplace=True, drop=True)
        

        WTR_AGE = writers_df[writers_df['tconst'] == tcon]['age'].mean()
        
        WTR_ROI = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
        WTR_RTG = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
        WTR_RNTM = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
        WTR_FILM_COUNT = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()
        
        
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_COUNT"] = WTR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_AGE"] = WTR_AGE
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_ROI"] = WTR_ROI
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_RTG"] = WTR_RTG
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_RNTM"] = WTR_RNTM
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_FILM_COUNT"] = WTR_FILM_COUNT
        return 
    
    else:
        # print('more than one writer')
        wtrs = writers_df[writers_df['tconst'] == tcon]['nconst']
        
        for wtr in wtrs:
            # print (wtr)
        
            dnconst = writers_df[writers_df['tconst'] == tcon]['nconst']
            dnconst.reset_index(inplace=True, drop=True)

            WTR_AGE_list = []
            WTR_ROI_list = []
            WTR_RTG_list = []
            WTR_RNTM_list = []
            WTR_FILM_COUNT_list = []

            WTR_AGE = writers_df[writers_df['tconst'] == tcon]['age'].mean()
            WTR_ROI = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
            WTR_RTG = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
            WTR_RNTM = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
            WTR_FILM_COUNT = writers_df[(writers_df['nconst'] == dnconst[0]) & (writers_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()

            WTR_AGE_list.append(WTR_AGE)
            WTR_ROI_list.append(WTR_ROI)
            WTR_RTG_list.append(WTR_RTG)
            WTR_RNTM_list.append(WTR_RNTM)
            WTR_FILM_COUNT_list.append(WTR_FILM_COUNT)
            
            
        
        
            
            
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_COUNT"] = WTR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_AGE"] = mean(WTR_AGE_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_ROI"] = mean(WTR_ROI_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_RTG"] = mean(WTR_RTG_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_RNTM"] = mean(WTR_RNTM_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "WTR_FILM_COUNT"] = mean(WTR_FILM_COUNT_list)

In [111]:
for tconst in Disney_df['tconst']:
    # print(f'Film: {tconst}')
    get_wtr_hist(tconst)
    
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   DFL_title       439 non-null    object        
 1   tconst          439 non-null    object        
 2   titleType       439 non-null    object        
 3   primaryTitle    439 non-null    object        
 4   originalTitle   439 non-null    object        
 5   startYear       439 non-null    int64         
 6   runtimeMinutes  439 non-null    int64         
 7   genres          439 non-null    object        
 8   lower_title     439 non-null    object        
 9   averageRating   439 non-null    float64       
 10  numVotes        439 non-null    float64       
 11  newurl          439 non-null    object        
 12  rating          439 non-null    float64       
 13  votes           439 non-null    object        
 14  year            439 non-null    int64         
 15  mpaara

We already knew that some of the writer information is missing from IMDB, but it looks like even more of the ROI, Rating, and Runtime data is missing.  We will likely drop these columns.

## 09.06 Clean Up Combined Data


Before we start doing any visualizations or modeling, we need to get everythin in to numerica values and deal with any null values

These are all string types, as we'd expect. We're going to leave these columns in the data for now, but we're not going to be feeding strings in to the model.
 - DFL_title
 - tconst
 - titleType
 - primaryTitle

These are all duplicates of some sort and can be dropped.
 - rating
 - votes
 - year
 - runtime
 - originalTitle
 - lowerTitle 

These are categorical columns so we will create dummies.
 - genres
 - mpaarating

These need to be int types
 - budget
 - worldwide

These are floats, which is fine for now.
 - averageRating
 - numVotes

### 09.06.01 Remove Duplicate Columns


In [None]:
Disney_df.drop(columns=['rating','votes','year','runtime','originalTitle','lower_title'], inplace=True)

In [None]:
Disney_df.head()

Unnamed: 0,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,newurl,...,DIR_ROI,DIR_RTG,DIR_RNTM,DIR_FILM_COUNT,WTR_COUNT,WTR_AGE,WTR_ROI,WTR_RTG,WTR_RNTM,WTR_FILM_COUNT
0,101 Dalmatians,tt0115433,movie,101 Dalmatians,1996,103,"Adventure,Comedy,Crime",5.7,109712.0,https://www.imdb.com/title/tt0115433,...,inf,6.583333,105.0,6.0,2.0,73.0,inf,6.410526,97.789474,19.0
1,102 Dalmatians,tt0211181,movie,102 Dalmatians,2000,100,"Adventure,Comedy,Family",4.8,37056.0,https://www.imdb.com/title/tt0211181,...,1.70572,7.1,83.0,2.0,5.0,104.0,,,,0.0
2,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea",1954,127,"Adventure,Drama,Family",7.2,33109.0,https://www.imdb.com/title/tt0046672,...,-1.0,6.44,73.866667,15.0,2.0,85.5,-1.0,6.183333,72.433333,30.0
3,A Bug's Life,tt0120623,movie,A Bug's Life,1998,95,"Adventure,Animation,Comedy",7.2,284538.0,https://www.imdb.com/title/tt0120623,...,,,,0.0,3.0,38.0,1.973084,7.3,93.0,1.0
4,A Christmas Carol,tt1067106,movie,A Christmas Carol,2009,96,"Adventure,Animation,Comedy",6.8,112582.0,https://www.imdb.com/title/tt1067106,...,4.572746,7.3,117.6,15.0,1.0,197.0,inf,6.464,100.860465,50.0


In [None]:
# Manually Fixing another error

In [None]:
Disney_df.at[418,'tconst'] = "tt1049413"
Disney_df.at[418,'startYear'] = 2009
Disney_df.at[418,'runtimeMinutes'] = 96
Disney_df.at[418,'genres'] = "Animation,Adventure,Comedy"
Disney_df.at[418,'averageRating'] = 8.3
Disney_df.at[418,'numVotes'] = 1009222
Disney_df.at[418,'newurl'] = "https://www.imdb.com/title/tt1049413"
Disney_df.at[418,'mpaarating'] = "PG"
Disney_df.at[418,'rlsdt'] = "May 29, 2009 (United States)"
Disney_df.at[418,'budget'] = "$175,000,000 (estimated)"
Disney_df.at[418,'worldwide'] = "$735,099,102"
# Disney_df.at[418,'budget_adj'] = 175000000
# Disney_df.at[418,'worldwide_adj'] = 735099102
# Disney_df.at[418,'RIO'] = 3.2

In [None]:
# These are both TV movies
Disney_df.drop(Disney_df[Disney_df['tconst'] == "tt0070131"].index, inplace = True)
Disney_df.drop(Disney_df[Disney_df['tconst'] == "tt0090840"].index, inplace = True)

Disney_df.reset_index(inplace=True)

In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           437 non-null    int64  
 1   DFL_title       437 non-null    object 
 2   tconst          437 non-null    object 
 3   titleType       437 non-null    object 
 4   primaryTitle    437 non-null    object 
 5   startYear       437 non-null    int64  
 6   runtimeMinutes  437 non-null    int64  
 7   genres          437 non-null    object 
 8   averageRating   437 non-null    float64
 9   numVotes        437 non-null    float64
 10  newurl          437 non-null    object 
 11  mpaarating      437 non-null    object 
 12  rlsdt           437 non-null    object 
 13  budget          260 non-null    object 
 14  worldwide       309 non-null    object 
 15  DIR_COUNT       433 non-null    float64
 16  DIR_AGE         389 non-null    float64
 17  DIR_ROI         283 non-null    flo

### 09.06.02 Clean Up Genre Column

In [None]:
Disney_df['genres'].head()

0        Adventure,Comedy,Crime
1       Adventure,Comedy,Family
2        Adventure,Drama,Family
3    Adventure,Animation,Comedy
4    Adventure,Animation,Comedy
Name: genres, dtype: object

In [None]:
# We use CV to get a list of genres for each film and create the appropriate category columns

cv = CountVectorizer()
cvec = cv.fit(Disney_df['genres'])
csr = cvec.transform(Disney_df['genres'])
cats = pd.DataFrame(csr.todense(), columns=cvec.get_feature_names())


In [None]:
# Combine the category coumns and the original feature films list
Disney_df = pd.concat([Disney_df,cats], axis=1) 

In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 49 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           437 non-null    int64  
 1   DFL_title       437 non-null    object 
 2   tconst          437 non-null    object 
 3   titleType       437 non-null    object 
 4   primaryTitle    437 non-null    object 
 5   startYear       437 non-null    int64  
 6   runtimeMinutes  437 non-null    int64  
 7   genres          437 non-null    object 
 8   averageRating   437 non-null    float64
 9   numVotes        437 non-null    float64
 10  newurl          437 non-null    object 
 11  mpaarating      437 non-null    object 
 12  rlsdt           437 non-null    object 
 13  budget          260 non-null    object 
 14  worldwide       309 non-null    object 
 15  DIR_COUNT       433 non-null    float64
 16  DIR_AGE         389 non-null    float64
 17  DIR_ROI         283 non-null    flo

In [None]:
Disney_df[Disney_df['DFL_title'].isnull()]

Unnamed: 0,index,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,...,horror,music,musical,mystery,news,romance,sci,sport,thriller,western


### 09.06.02 Clean Up MPAA Rating Column

In [None]:
Disney_df['mpaarating'].value_counts()

PG           175
G            167
Approved      44
PG-13         32
Not Rated     11
Passed         7
Unrated        1
Name: mpaarating, dtype: int64

In [170]:
# The ratings helped up find films that we should have excluded.  You've seen a few manual clean ups of spcific films up to this point, and this is one of the reasons why.
Disney_df[Disney_df['mpaarating'] =='X']

Unnamed: 0,index,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,...,budget_adj,worldwide_adj,ROI,rlsdt_dt,rlsdt_mo,rlsdt_day,rlsdt_daynm,rlsdt_dayofwk,rlsdt_season,rlsdt_season_NM


In [None]:
# The ratings helped up find films that we should have excluded.  You've seen a few manual clean ups of spcific films up to this point, and this is one of the reasons why.
Disney_df[Disney_df['mpaarating'] =='TV-G']

Unnamed: 0,index,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,...,horror,music,musical,mystery,news,romance,sci,sport,thriller,western


In [None]:
Disney_df['mpaarating'].value_counts()

PG           175
G            167
Approved      44
PG-13         32
Not Rated     11
Passed         7
Unrated        1
Name: mpaarating, dtype: int64

### 'Approved' and 'Passed' pre-date the current MPAA rating system.  
 - 'Passed' what was known as the 'Haynes Code' (1930-1934) 
 - 'Approved' for exhibition (1934-1968)   
 - G, M, R, and X were used from 1968 to 1970 'M' meant "Suggested for mature audiences - Parental discretion advised"
 - G, GP, R, X were used from 1970 to 1972 'GP' meant "All ages admitted – Parental guidance suggested."
 - G, PG, R, X were used from 1973 to 1984 PG meant "Parental guidance suggested – Some material may not be suitable for pre-teenagers."
 - PG-13 was added in 1984 PG-13 meant "Parents strongly cautioned – Some material may be inappropriate for children under 13"
 
### We're going to group these and then create dummy columns. 
  - G : Passed, Approved, G
  - PG : PG
  - PG-13 : PG-13
  - Not Rated : Not Rated, Unrated

 
 With help from https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system#Replacement_of_the_Hays_Code

In [None]:
genres = {"Passed": "G", "Approved": "G", "G": "G", "PG": "PG",  "PG-13": "PG-13", "Unrated": "Not Rated", "Not Rated": "Not Rated"}

In [None]:
Disney_df['newmpaarating'] = Disney_df['mpaarating']

In [None]:
Disney_df.replace({"newmpaarating": genres}, inplace=True)

In [None]:
Disney_df = pd.get_dummies(Disney_df, columns = ['newmpaarating'])

In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    437 non-null    int64  
 1   DFL_title                437 non-null    object 
 2   tconst                   437 non-null    object 
 3   titleType                437 non-null    object 
 4   primaryTitle             437 non-null    object 
 5   startYear                437 non-null    int64  
 6   runtimeMinutes           437 non-null    int64  
 7   genres                   437 non-null    object 
 8   averageRating            437 non-null    float64
 9   numVotes                 437 non-null    float64
 10  newurl                   437 non-null    object 
 11  mpaarating               437 non-null    object 
 12  rlsdt                    437 non-null    object 
 13  budget                   260 non-null    object 
 14  worldwide                3

In [None]:
Disney_df['action'].value_counts()

0    365
1     72
Name: action, dtype: int64

### 09.06.03 Clean Up Budget Column


In [None]:
Disney_df['budget'].fillna("error",inplace=True)
Disney_df['budget_adj'] = Disney_df['budget'].apply(lambda x: x[:-12] if "estimated" in x else x)
Disney_df['budget_adj'] = Disney_df['budget_adj'].apply(lambda x: x.replace(",",""))
Disney_df['budget_adj'] = Disney_df['budget_adj'].apply(lambda x: x if "error" in x else x[1:])
Disney_df['budget_adj'] = Disney_df['budget_adj'].apply(lambda x: 0 if "error" in x else x)
Disney_df['budget_adj'].head(20)

0      75000000
1      85000000
2       9000000
3     120000000
4     200000000
5             0
6      18000000
7      15000000
8             0
9     100000000
10      7000000
11      5000000
12     28000000
13    183000000
14     28000000
15      3000000
16    200000000
17    170000000
18            0
19            0
Name: budget_adj, dtype: object

In [None]:
def convert(val):
    try:
        return(int(val))
    except:
        return(0)


Disney_df['budget_adj'] = Disney_df['budget_adj'].apply(lambda x: convert(x)) 

In [None]:
Disney_df['budget_adj']

0       75000000
1       85000000
2        9000000
3      120000000
4      200000000
         ...    
432            0
433    165000000
434            0
435            0
436    150000000
Name: budget_adj, Length: 437, dtype: int64

In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 54 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    437 non-null    int64  
 1   DFL_title                437 non-null    object 
 2   tconst                   437 non-null    object 
 3   titleType                437 non-null    object 
 4   primaryTitle             437 non-null    object 
 5   startYear                437 non-null    int64  
 6   runtimeMinutes           437 non-null    int64  
 7   genres                   437 non-null    object 
 8   averageRating            437 non-null    float64
 9   numVotes                 437 non-null    float64
 10  newurl                   437 non-null    object 
 11  mpaarating               437 non-null    object 
 12  rlsdt                    437 non-null    object 
 13  budget                   437 non-null    object 
 14  worldwide                3

### 09.06.04 Clean Up Worldwide Revenue Column


In [None]:
Disney_df['worldwide'].fillna("error",inplace=True)
# Disney_df['worldwide_adj'] = Disney_df['worldwide'].apply(lambda x: x[:-12] if "estimated" in x else x)
Disney_df['worldwide_adj'] = Disney_df['worldwide'].apply(lambda x: x.replace(",",""))
Disney_df['worldwide_adj'] = Disney_df['worldwide_adj'].apply(lambda x: x if "error" in x else x[1:])
Disney_df['worldwide_adj'] = Disney_df['worldwide_adj'].apply(lambda x: 0 if "error" in x else x)
Disney_df['worldwide_adj'].head(20)

0      320689294
1      183611771
2              0
3      363258859
4      325286646
5       12890752
6       35348597
7       13406717
8              0
9      132675864
10      34368475
11      30857747
12     504050219
13    1050693953
14     100654149
15             0
16    1025468216
17     299820798
18      12775590
19             0
Name: worldwide_adj, dtype: object

In [None]:
def convert(val):
    try:
        return(int(val))
    except:
        return(0)


Disney_df['worldwide_adj'] = Disney_df['worldwide_adj'].apply(lambda x: convert(x)) 

In [None]:
Disney_df['worldwide_adj']

0       320689294
1       183611771
2               0
3       363258859
4       325286646
          ...    
432             0
433     471222889
434             0
435          3000
436    1024121104
Name: worldwide_adj, Length: 437, dtype: int64

### 09.06.05 Create ROI Column


In [None]:
Disney_df['ROI'] = (Disney_df.worldwide_adj - Disney_df.budget_adj ) / Disney_df.budget_adj

In [None]:
Disney_df['ROI'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
Disney_df['ROI']

0      3.275857
1      1.160138
2     -1.000000
3      2.027157
4      0.626433
         ...   
432         NaN
433    1.855896
434         NaN
435         NaN
436    5.827474
Name: ROI, Length: 437, dtype: float64

In [None]:
Disney_df.head()

Unnamed: 0,index,DFL_title,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,...,sport,thriller,western,newmpaarating_G,newmpaarating_Not Rated,newmpaarating_PG,newmpaarating_PG-13,budget_adj,worldwide_adj,ROI
0,0,101 Dalmatians,tt0115433,movie,101 Dalmatians,1996,103,"Adventure,Comedy,Crime",5.7,109712.0,...,0,0,0,1,0,0,0,75000000,320689294,3.275857
1,1,102 Dalmatians,tt0211181,movie,102 Dalmatians,2000,100,"Adventure,Comedy,Family",4.8,37056.0,...,0,0,0,1,0,0,0,85000000,183611771,1.160138
2,2,"20,000 Leagues Under the Sea",tt0046672,movie,"20,000 Leagues Under the Sea",1954,127,"Adventure,Drama,Family",7.2,33109.0,...,0,0,0,1,0,0,0,9000000,0,-1.0
3,3,A Bug's Life,tt0120623,movie,A Bug's Life,1998,95,"Adventure,Animation,Comedy",7.2,284538.0,...,0,0,0,1,0,0,0,120000000,363258859,2.027157
4,4,A Christmas Carol,tt1067106,movie,A Christmas Carol,2009,96,"Adventure,Animation,Comedy",6.8,112582.0,...,0,0,0,0,0,1,0,200000000,325286646,0.626433


In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 56 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    437 non-null    int64  
 1   DFL_title                437 non-null    object 
 2   tconst                   437 non-null    object 
 3   titleType                437 non-null    object 
 4   primaryTitle             437 non-null    object 
 5   startYear                437 non-null    int64  
 6   runtimeMinutes           437 non-null    int64  
 7   genres                   437 non-null    object 
 8   averageRating            437 non-null    float64
 9   numVotes                 437 non-null    float64
 10  newurl                   437 non-null    object 
 11  mpaarating               437 non-null    object 
 12  rlsdt                    437 non-null    object 
 13  budget                   437 non-null    object 
 14  worldwide                4

In [None]:
Disney_df['newmpaarating_G'].value_counts()

0    219
1    218
Name: newmpaarating_G, dtype: int64

### 09.06.05 Clean Up Release Date Column

In [171]:
Disney_df['rlsdt_dt']= pd.to_datetime(Disney_df['rlsdt'],errors='coerce')
# Create Columns for Month, Days, Days of the Week, and Seasons
Disney_df['rlsdt_mo']= pd.DatetimeIndex(Disney_df['rlsdt_dt']).month
Disney_df['rlsdt_day']= pd.DatetimeIndex(Disney_df['rlsdt_dt']).day
Disney_df['rlsdt_daynm']= pd.to_datetime(Disney_df['rlsdt_dt']).dt.day_name()

In [None]:
days = {"Sunday": 1, "Monday": 2, "Tuesday": 3, "Wednesday": 4,  "Thursday": 5, "Friday": 6, "Saturday": 7}

In [None]:
Disney_df['rlsdt_dayofwk'] = Disney_df['rlsdt_daynm']

In [None]:
Disney_df.replace({"rlsdt_dayofwk": days}, inplace=True)

In [None]:
# season 1 spring, 2 summer, 3 fall, 4 winter (Approx.)
seasons = {1: 4, 2: 4, 3: 4, 4: 1,  5: 1, 6: 1, 7: 2, 8: 2, 9: 2, 10: 3, 11:3, 12:3}

In [None]:
# season 1 spring, 2 summer, 3 fall, 4 winter (Approx.)
season_NM = {1: "Winter", 2: "Winter", 3: "Winter", 4: "Spring",  5: "Spring", 6: "Spring", 7: "Summer", 8: "Summer", 9: "Summer", 10: "Fall", 11:"Fall", 12:"Fall"}

In [None]:
Disney_df['rlsdt_season'] = Disney_df['rlsdt_mo']
Disney_df['rlsdt_season_NM'] = Disney_df['rlsdt_mo']

In [None]:
Disney_df.replace({"rlsdt_season": seasons}, inplace=True)
Disney_df.replace({"rlsdt_season_NM": season_NM}, inplace=True)

In [None]:
Disney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   index                    437 non-null    int64         
 1   DFL_title                437 non-null    object        
 2   tconst                   437 non-null    object        
 3   titleType                437 non-null    object        
 4   primaryTitle             437 non-null    object        
 5   startYear                437 non-null    int64         
 6   runtimeMinutes           437 non-null    int64         
 7   genres                   437 non-null    object        
 8   averageRating            437 non-null    float64       
 9   numVotes                 437 non-null    float64       
 10  newurl                   437 non-null    object        
 11  mpaarating               437 non-null    object        
 12  rlsdt                    437 non-nul

In [None]:
Disney_df['rlsdt_dayofwk'].value_counts()

6.0    283
4.0     80
5.0     42
3.0     15
7.0     10
2.0      3
1.0      3
Name: rlsdt_dayofwk, dtype: int64

In [None]:
Disney_df['rlsdt_mo'].value_counts()

6.0     60
7.0     55
11.0    54
3.0     54
12.0    41
2.0     34
5.0     31
4.0     30
8.0     26
10.0    23
1.0     17
9.0     11
Name: rlsdt_mo, dtype: int64

## 09.07 Export Files Ready for further EDA and Visualizations


In [None]:
Disney_df.to_csv('../Bens_Data/Disney_Films_For_EDA.csv')