In [1]:
import pandas as pd

In [346]:
import re
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

def summarize_text(dataframe, genres, num_ngrams=10):
    # Filling in NaN values
    if dataframe['review'].isnull().any():
        dataframe['review'] = dataframe['review'].fillna('This abstract does not exist')

    # Concatenate all abstracts into a single text
    combined_text = ' '.join(dataframe['review'])

    # Getting tf-idf tables
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([combined_text])
    feature_names = vectorizer.get_feature_names()

    # Generating the best summary for the combined text
    abstract = combined_text
    tfidf_scores = tfidf_matrix.toarray().flatten()

    # Sentence Tokenization
    pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    sentences = re.split(pattern, abstract)
    sentences = [sentence for sentence in sentences if any(char.isalpha() for char in sentence)]

    # N-gram scoring based on tf-idf values
    ngram_scores = {}
    for sentence in sentences:
        words = sentence.split(' ')

        # Generate n-grams using sliding window
        ngrams_list = []
        window_size = 10
        for i in range(len(words) - window_size + 1):
            ngram = words[i:i + window_size]
            ngrams_list.append(ngram)

        for ngram in ngrams_list:
            tfidf_score = sum(tfidf_scores[vectorizer.vocabulary_.get(word.lower(), -1)] for word in ngram)
            for i in genres:
                if i.lower() in [item.lower() for item in ngram]:
                    tfidf_score += 1.0
            ngram_scores[' '.join(ngram)] = tfidf_score

    # Select top-scoring non-overlapping n-grams
    top_ngrams = []
    for ngram, score in sorted(ngram_scores.items(), key=lambda x: x[1], reverse=True):
        # Check if n-gram contains any words from existing top n-grams
        if not any(word in ' '.join(top_ngrams) for word in ngram.split()):
            top_ngrams.append(ngram)
        if len(top_ngrams) == num_ngrams:
            break

    # Concatenate top n-grams
    summary = ' '.join(top_ngrams)
    return summary

In [357]:
features = pd.read_csv('fixed_preproc.csv')

In [359]:
watchlist = pd.read_csv('Final_valid_files.csv')

In [457]:
paths = ['Evil Dead II 1987',
'Duck Soup 1933',
'Gone Baby Gone 2007',
'WALL·E 2008',
'The Babadook 2014',
'1408 2007',
'Changeling 2008',
'The Lego Movie 2014',
'Draft Day 2014',
'Dunkirk 2017']

In [521]:
movies = ['Evil Dead II',
'Duck Soup',
'Gone Baby Gone',
'WALL·E',
'The Babadook',
'1408',
'Changeling',
'The Lego Movie',
'Draft Day',
'Dunkirk']

In [595]:
import os
# Specify the directory path
directory = 'C:\\Users\\Tanvi Kandalla\\Downloads\\sem_project(2)\\sem_project\\data\\raw\\2_reviews_per_movie_raw\\'

# Get the list of file paths in the directory
file_paths = []
for root, directories, files in os.walk(directory):
    for file in files:
        file_paths.append(os.path.join(root, file))
filenames = [os.path.basename(path) for path in file_paths]

In [596]:
file_names = []
for i in range(0,len(filenames)):
    file_names.append(filenames[i][:-9])

In [597]:
filenames

['10 Cloverfield Lane 2016.csv',
 '10 Things I Hate About You 1999.csv',
 '12 Angry Men 1957.csv',
 '12 Monkeys 1995.csv',
 '12 Strong 2018.csv',
 '12 Years a Slave 2013.csv',
 '127 Hours 2010.csv',
 '13 Hours 2016.csv',
 '1408 2007.csv',
 '1917 2019.csv',
 '1941 1979.csv',
 '2001%3A A Space Odyssey 1968.csv',
 '2012 2009.csv',
 '21 2008.csv',
 '21 Jump Street 2012.csv',
 '22 Jump Street 2014.csv',
 '28 Days Later... 2002.csv',
 '28 Weeks Later 2007.csv',
 '3 Idiots 2009.csv',
 '3%3A10 to Yuma 2007.csv',
 '30 Days of Night 2007.csv',
 '300 2006.csv',
 '42 2013.csv',
 '50 First Dates 2004.csv',
 '500 Days of Summer 2009.csv',
 '50_50 2011.csv',
 '54 1998.csv',
 '8 Mile 2002.csv',
 'A Beautiful Mind 2001.csv',
 'A Bridge Too Far 1977.csv',
 "A Bug's Life 1998.csv",
 'A Clockwork Orange 1971.csv',
 "A Hard Day's Night 1964.csv",
 'A Night at the Roxbury 1998.csv',
 'A Nightmare on Elm Street 1984.csv',
 'A Quiet Place 2018.csv',
 'A Star Is Born 2018.csv',
 'A Walk to Remember 2002.csv',


In [598]:
movies = watchlist['Input'].unique()
for i in movies:
    if i not in file_names:
        movies.remove(i)

In [599]:
len(movies)

271

In [600]:
paths = []
for i in range(0,len(movies)):
    if movies[i] in file_names:
        paths.append("C:\\Users\\Tanvi Kandalla\\Downloads\\sem_project(2)\\sem_project\\data\\raw\\2_reviews_per_movie_raw\\"+filenames[i])

In [601]:
len(paths)

271

In [602]:
features['release_date'][0][-4:]

'2008'

In [603]:
new_watchlist = watchlist
for i in range(0,len(watchlist['Input'])):
    if watchlist['Input'][i] not in movies:
        new_watchlist = new_watchlist.drop(i, axis = 0)

In [604]:
new_watchlist = new_watchlist.reset_index()
new_watchlist = new_watchlist.drop('index',axis = 1)

In [605]:
new_watchlist

Unnamed: 0,Movie 1,Movie 2,Movie 3,Movie 4,Movie 5,Movie 6,Movie 7,Movie 8,Movie 9,Movie 10,Input,Review
0,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,The Great Dictator,"Yes, I believe you would enjoy The Great Dicta..."
1,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Evil Dead II,"Based on your movie history, you would likely ..."
2,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Duck Soup,"Yes, you would enjoy Duck Soup. This Marx Brot..."
3,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Gone Baby Gone,"Yes, I would enjoy Gone Baby Gone based on the..."
4,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,WALL·E,"Yes, I think you would enjoy watching WALL·E. ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
383,The Muppet Movie,Hard Ball,Field of Dreams,The Empire Strikes Back,"South Park: Bigger, Longer & Uncut",Oliver & Company,Don't Look Up,Onward,Mononoke-hime,Sanju,Platoon,"Yes, based on your history of movies you would..."
384,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Inside Man,"Yes, I think you would enjoy Inside Man. It is..."
385,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Raging Bull,"Yes, I believe I would enjoy the movie Raging ..."
386,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Star Trek,"Yes, you would most likely enjoy Star Trek. Th..."


In [636]:
def createInputs(watchlist, features, watchlistRow):
    genres = []
    for j in range(1,11):
        col = "Movie "+str(j)
        temp = []
        ind = -1
        for movie in watchlist[col]:
            for title in range(0,len(features['Title'])):
                if watchlist[col][watchlistRow] == features['Title'][title]:
                    ind = title
                    break
            if ind > -1:
                genres_str = features.loc[ind, 'genres']
                genres_list = [item.strip().strip("\"'") for item in genres_str.strip("[]").split(",")]
                temp.extend(genres_list)
        genres.append(temp)
        
    input_1 = []
    for i in genres:
        for j in i:
            if j not in input_1:
                input_1.append(j)
    return input_1

In [637]:
def makeSummaries(watchlist, features, movie_list, movie_paths):
    summary = []
    for i in range(0,len(watchlist['Input'])):
        inp = createInputs(watchlist, features, i)
        if not isinstance(movie_list,list):
            ind = movie_list.tolist().index(watchlist['Input'][i])
        else:
            ind = movie_list.index(watchlist['Input'][i])
        review = pd.read_csv(movie_paths[ind])
        x = summarize_text(review,inp,10)
        summary.append(x)
        
    return summary

In [638]:
features = pd.read_csv("genres.csv")

In [639]:
features

Unnamed: 0,Title,genres
0,13 Hours,"['Horror', ' Thriller', ' Mystery']"
1,X-Men,"['Action', ' Adventure', ' Science Fiction', '..."
2,Minnal Murali,"['Action', ' Comedy', ' Fantasy']"
3,KÃ´kaku KidÃ´tai,"['Action', ' Adventure', ' Animation', ' Crime..."
4,District 9,['Science Fiction']
...,...,...
718,Mononoke-hime,"['Adventure', 'Fantasy', 'Animation']"
719,Coraline,"['Animation', 'Family', 'Fantasy']"
720,Das weiße Band - Eine deutsche Kindergeschichte,"['Drama', 'Mystery']"
721,Cabaret,"['Music', 'Drama', 'Romance']"


In [640]:
s = makeSummaries(new_watchlist, features, movies, paths)





























In [641]:
n = new_watchlist

In [642]:
n['Summaries'] = s

In [643]:
n

Unnamed: 0,Movie 1,Movie 2,Movie 3,Movie 4,Movie 5,Movie 6,Movie 7,Movie 8,Movie 9,Movie 10,Input,Review,Summaries
0,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,The Great Dictator,"Yes, I believe you would enjoy The Great Dicta...",is the suspense thriller and there is the mons...
1,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Evil Dead II,"Based on your movie history, you would likely ...",the original author and the comedy The Taming ...
2,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Duck Soup,"Yes, you would enjoy Duck Soup. This Marx Brot...",and music that intensifies the drama almost to...
3,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Gone Baby Gone,"Yes, I would enjoy Gone Baby Gone based on the...","message to the future, the music turns the act..."
4,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,WALL·E,"Yes, I think you would enjoy watching WALL·E. ...",the highs are the action scenes and not the de...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,The Muppet Movie,Hard Ball,Field of Dreams,The Empire Strikes Back,"South Park: Bigger, Longer & Uncut",Oliver & Company,Don't Look Up,Onward,Mononoke-hime,Sanju,Platoon,"Yes, based on your history of movies you would...",all add great elements of comedy and drama to ...
384,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Inside Man,"Yes, I think you would enjoy Inside Man. It is...",the adventure with them and the music and dire...
385,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Raging Bull,"Yes, I believe I would enjoy the movie Raging ...","comedy in these situations, and the gravity in..."
386,Jagten,Coraline,Das weiße Band - Eine deutsche Kindergeschichte,Shrek 2,Cabaret,1 - Nenokkadine,Kapoor & Sons,X-Men: Days of Future Past,Kubo and the Two Strings,Mr. Smith Goes to Washington,Star Trek,"Yes, you would most likely enjoy Star Trek. Th...",the drama and comedy flowing through the music...


In [644]:
from rouge import Rouge
ROUGE = Rouge()
ro = []
for i in range(0,len(n['Review'])):
    score = ROUGE.get_scores(n['Review'][i], n['Summaries'][i])
    p = score[0]["rouge-1"]["p"]
    r = score[0]["rouge-1"]["r"]
    f1 = score[0]["rouge-1"]["f"]
    ro.append([p, r, f1])

In [645]:
p_sum = 0
t = []
for score in ro:
    p_sum += score[0]
    t.append(score[0])
total = len(ro)
avg = p_sum/total

In [635]:
print(avg, max(t))

0.18826078796689863 0.2463768115942029


In [494]:
new_watchlist.to_csv('summaries_1.csv',index = None)