In [137]:
import pandas as pd
from pathlib import Path  
from itertools import product


In [113]:
# Read the popular-movies CSV and keep only the columns this notebook uses
# (any of the three that are absent from the file are silently skipped).
filePath = Path('data') / 'popularmovies.csv'
df = pd.read_csv(filePath).pipe(
    lambda movies: movies.loc[
        :, movies.columns.intersection(['title', 'overview', 'popularity'])
    ]
)
df

Unnamed: 0,title,overview,popularity
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,186.429
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",249.306
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,89.247
3,Schindler's List,The true story of how businessman Oskar Schind...,126.803
4,12 Angry Men,The defense and the prosecution have rested an...,76.259
...,...,...,...
1975,Batman Beyond: Return of the Joker,"The Joker is back with a vengeance, and Neo-Go...",20.815
1976,Crossroads,A wanna-be blues guitar virtuoso seeks a long-...,19.169
1977,Carrie,Withdrawn and sensitive teen Carrie White face...,29.995
1978,Phantom Thread,"In 1950s London, a renowned dressmaker's metic...",24.640


In [115]:
# Clean the title column
def clean_title(title):
    """Return the title converted to lower case.

    Parameters
    ----------
    title : str
        Raw movie title.

    Returns
    -------
    str
        ``title`` in lower case; no words are removed or reordered.
    """
    return title.lower()

# Lower-cased title plus the number of whitespace-separated words in it.
df['clean_title'] = df.title.apply(clean_title)
df['title_len'] = df.title.str.split().str.len()

# Keep only titles with more than one word. The explicit .copy() makes the
# filtered frame independent of the frame it was sliced from before new
# columns are added to it.
colNames = ['title','overview','popularity','clean_title','title_len']
df = df.loc[df.title_len > 1, colNames].copy()

# Split each cleaned title once and reuse the word lists for both columns.
title_words = df.clean_title.str.split()
df['first_word'] = title_words.str[0]
df['last_word'] = title_words.str[-1]
df

Unnamed: 0,title,overview,popularity,clean_title,title_len,first_word,last_word
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,186.429,the shawshank redemption,3,the,redemption
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",249.306,the godfather,2,the,godfather
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,89.247,the godfather part ii,4,the,ii
3,Schindler's List,The true story of how businessman Oskar Schind...,126.803,schindler's list,2,schindler's,list
4,12 Angry Men,The defense and the prosecution have rested an...,76.259,12 angry men,3,12,men
...,...,...,...,...,...,...,...
1970,Santa Claus Is a Stinker,"Two neurotics, working for a suicide hotline o...",12.424,santa claus is a stinker,5,santa,stinker
1971,Nosferatu the Vampyre,A real estate agent leaves behind his beautifu...,53.698,nosferatu the vampyre,3,nosferatu,vampyre
1973,Elisa & Marcela,The film tells the true story of Elisa Sánchez...,9.658,elisa & marcela,3,elisa,marcela
1975,Batman Beyond: Return of the Joker,"The Joker is back with a vengeance, and Neo-Go...",20.815,batman beyond: return of the joker,6,batman,joker


In [116]:
# Bland Movie Centipede: titles are linked on complete words only.
# Index the cleaned titles twice: by their opening word and by their final word.
firstWords = df.groupby('first_word')['clean_title'].apply(list).to_dict()
lastWords = df.groupby('last_word')['clean_title'].apply(list).to_dict()

# heads: titles whose first word equals this title's last word.
# tails: titles whose last word equals this title's first word.
df['heads'] = df['last_word'].map(lambda word: firstWords.get(word, []))
df['tails'] = df['first_word'].map(lambda word: lastWords.get(word, []))

# Score = number of (tail, head) pairings available for this title.
df['tail_length'] = df['tails'].map(len)
df['head_length'] = df['heads'].map(len)
df['naiive_score'] = df['tail_length'] * df['head_length']

df = df.sort_values(by="naiive_score", ascending=False)
df

Unnamed: 0,title,overview,popularity,clean_title,title_len,first_word,last_word,heads,tails,tail_length,head_length,naiive_score
628,One Life,British stockbroker Nicholas Winton visits Cze...,40.643,one life,2,one,life,"[life is beautiful, life in a year, life of br...","[transformers one, father there is only one, r...",7,4,28
847,Father There Is Only One,"Javier is what we have dubbed as a ""husband-in...",18.808,father there is only one,5,father,one,"[one flew over the cuckoo's nest, one hundred ...","[the father, in the name of the father, first ...",3,7,21
780,Day & Night,"When Day, a sunny fellow, encounters Night, a ...",10.364,day & night,3,day,night,"[night of the living dead, night on earth]","[a brighter summer day, terminator 2: judgment...",9,2,18
615,Day for Night,A committed film director struggles to complet...,16.229,day for night,3,day,night,"[night of the living dead, night on earth]","[a brighter summer day, terminator 2: judgment...",9,2,18
401,Three Colors: Red,Part-time model Valentine unexpectedly befrien...,25.535,three colors: red,3,three,red,"[red beard, red shoes and the seven dwarfs, re...","[i'm starting from three, one, two, three, the...",3,5,15
...,...,...,...,...,...,...,...,...,...,...,...,...
672,A Walk to Remember,"When the popular, restless Landon Carter is fo...",63.965,a walk to remember,4,a,remember,[remember the titans],[],0,1,0
671,Train to Busan,When a zombie virus pushes Korea into a state ...,145.493,train to busan,3,train,busan,[],[demon slayer -kimetsu no yaiba- the movie: mu...,4,0,0
669,The Color Purple,An epic tale spanning forty years in the life ...,25.312,the color purple,3,the,purple,"[purple hearts, purple noon]",[],0,2,0
668,Nothing Left to Do But Cry,Two 20th-century friends accidentally stumble ...,10.492,nothing left to do but cry,6,nothing,cry,[],[],0,0,0


In [157]:
# Keep only titles with a positive naiive_score. The explicit .copy() gives an
# independent frame, so adding columns to it later cannot trigger pandas'
# SettingWithCopyWarning.
df = df.loc[df.naiive_score > 0].copy()


def generate_centipedes(row):
    """Build every three-title chain that has this row's title in the middle.

    Parameters
    ----------
    row : mapping
        Must provide 'tails' and 'heads' (iterables of titles) and
        'clean_title' (this row's title).

    Returns
    -------
    list of tuple
        One ``(tails_item, clean_title, heads_item)`` tuple per pairing,
        iterating over 'tails' in the outer loop and 'heads' in the inner loop.
        Empty if either 'tails' or 'heads' is empty.
    """
    body = row['clean_title']
    leading = row['tails']
    trailing = row['heads']
    return [(before, body, after) for before in leading for after in trailing]
    
# Attach the list of (head, body, tail) tuples to each title, then explode so
# that every tuple gets its own row.
df['full_centipede'] = df.apply(generate_centipedes, axis=1)
df_centipede = df.explode('full_centipede', ignore_index=True)

# Unpack each 3-tuple into separate head / body / tail columns.
df_centipede[['head', 'body', 'tail']] = pd.DataFrame(
    df_centipede['full_centipede'].tolist(), index=df_centipede.index
)

# Keep only the chain plus the body movie's popularity and overview; the
# column selection discards every other helper column.
df_centipede = df_centipede.rename(columns={"popularity": "body_pop", "overview": "body_view"})
df_centipede = df_centipede[['head', 'body', 'tail', "body_pop", "body_view"]]

# TODO: build a frame for each instance that includes the head, body, tail,
# head_pop, body_pop and tail_pop.
df_centipede

Unnamed: 0,head,body,tail,body_pop,body_view
0,transformers one,one life,life is beautiful,40.643,British stockbroker Nicholas Winton visits Cze...
1,transformers one,one life,life in a year,40.643,British stockbroker Nicholas Winton visits Cze...
2,transformers one,one life,life of brian,40.643,British stockbroker Nicholas Winton visits Cze...
3,transformers one,one life,life of pi,40.643,British stockbroker Nicholas Winton visits Cze...
4,father there is only one,one life,life is beautiful,40.643,British stockbroker Nicholas Winton visits Cze...
...,...,...,...,...,...
398,who am i,i can only imagine,imagine me & you,23.101,10-year-old Bart Millard lives with his mother...
399,"watch out, we're mad",mad max 2,2 hearts,31.288,Max Rockatansky returns as the heroic loner wh...
400,to all the boys: always and forever,forever my girl,girl in the basement,29.933,"After being gone for a decade, a country star ..."
401,how to steal a million,million dollar baby,baby driver,30.108,Despondent over a painful estrangement from hi...


ValueError: Length of values (403) does not match length of index (92)

[(1, 5), (1, 6), (1, 7), (2, 5), (2, 6), (2, 7), (3, 5), (3, 6), (3, 7)]
