In [1]:
import pandas as pd

In [346]:
import re
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

def summarize_text(dataframe, genres, num_ngrams=10):
    # Filling in NaN values
    if dataframe['review'].isnull().any():
        dataframe['review'] = dataframe['review'].fillna('This abstract does not exist')

    # Concatenate all abstracts into a single text
    combined_text = ' '.join(dataframe['review'])

    # Getting tf-idf tables
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([combined_text])
    feature_names = vectorizer.get_feature_names()

    # Generating the best summary for the combined text
    abstract = combined_text
    tfidf_scores = tfidf_matrix.toarray().flatten()

    # Sentence Tokenization
    pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    sentences = re.split(pattern, abstract)
    sentences = [sentence for sentence in sentences if any(char.isalpha() for char in sentence)]

    # N-gram scoring based on tf-idf values
    ngram_scores = {}
    for sentence in sentences:
        words = sentence.split(' ')

        # Generate overlapping n-grams using sliding window
        ngrams_list = []
        window_size = 10
        for i in range(len(words) - window_size + 1):
            ngram = words[i:i + window_size]
            ngrams_list.append(ngram)

        for ngram in ngrams_list:
            tfidf_score = sum(tfidf_scores[vectorizer.vocabulary_.get(word.lower(), -1)] for word in ngram)
            for i in genres:
                if i.lower() in [item.lower() for item in ngram]:
                    tfidf_score += 1.0
            ngram_scores[' '.join(ngram)] = tfidf_score

    # Select top-scoring non-overlapping n-grams
    top_ngrams = []
    for ngram, score in sorted(ngram_scores.items(), key=lambda x: x[1], reverse=True):
        # Check if n-gram contains any words from existing top n-grams
        if not any(word in ' '.join(top_ngrams) for word in ngram.split()):
            top_ngrams.append(ngram)
        if len(top_ngrams) == num_ngrams:
            break

    # Concatenate top n-grams
    summary = ' '.join(top_ngrams)
    return summary

In [357]:
features = pd.read_csv('fixed_preproc.csv')

In [359]:
watchlist = pd.read_csv('Final_valid_files.csv')

In [457]:
paths = ['Evil Dead II 1987',
'Duck Soup 1933',
'Gone Baby Gone 2007',
'WALL·E 2008',
'The Babadook 2014',
'1408 2007',
'Changeling 2008',
'The Lego Movie 2014',
'Draft Day 2014',
'Dunkirk 2017']

In [458]:
movies = ['Evil Dead II',
'Duck Soup',
'Gone Baby Gone',
'WALL·E',
'The Babadook',
'1408',
'Changeling',
'The Lego Movie',
'Draft Day',
'Dunkirk']

In [459]:
new_watchlist = watchlist
for i in range(0,len(watchlist['Input'])):
    if watchlist['Input'][i] not in movies:
        new_watchlist = new_watchlist.drop(i, axis = 0)

In [460]:
new_watchlist = new_watchlist.reset_index()
new_watchlist = new_watchlist.drop('index',axis = 1)

In [461]:
new_watchlist

Unnamed: 0,Movie 1,Movie 2,Movie 3,Movie 4,Movie 5,Movie 6,Movie 7,Movie 8,Movie 9,Movie 10,Input,Review
0,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Evil Dead II,"Based on your movie history, you would likely ..."
1,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Duck Soup,"Yes, you would enjoy Duck Soup. This Marx Brot..."
2,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Gone Baby Gone,"Yes, I would enjoy Gone Baby Gone based on the..."
3,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,WALL·E,"Yes, I think you would enjoy watching WALL·E. ..."
4,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,The Babadook,"Yes, I would enjoy The Babadook based on my hi..."
5,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,1408,"Yes, I believe you would enjoy 1408, based on ..."
6,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Changeling,"Yes, I would enjoy Changeling. I have watched ..."
7,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,The Lego Movie,"Yes, I believe you would enjoy The Lego Movie ..."
8,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Draft Day,"Yes, I believe I would enjoy the movie Draft D..."
9,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Dunkirk,"Yes, based on your movie history, you would en..."


In [462]:
def createInputs(watchlist, features, watchlistRow):
    genres = []
    for j in range(1,11):
        col = "Movie "+str(j)
        ind = (features[features['Title'] == watchlist[col][0]].index.tolist())
        temp = []
        for movie in watchlist[col]:
            ind = features[features['Title'] == movie].index.tolist()
            for index in ind:
                genres_str = features.loc[index, 'genres']
                genres_list = [item.strip().strip("\"'") for item in genres_str.strip("[]").split(",")]
                temp.extend(genres_list)
        genres.append(temp)
        
    input_1 = []
    for i in genres:
        for j in i:
            if j not in input_1:
                input_1.append(j)
    return input_1

In [463]:
def makeSummaries(watchlist, features, movie_list, movie_paths):
    for i in range(0,len(movie_paths)):
        movie_paths[i] = "C:\\Users\\Tanvi Kandalla\\Downloads\\sem_project(2)\\sem_project\\data\\raw\\2_reviews_per_movie_raw\\"+movie_paths[i]+".csv"
    
    summary = []
    for i in range(0,len(watchlist['Input'])):
        inp = createInputs(watchlist, features, i)
        if not isinstance(movie_list,list):
            ind = movie_list.tolist().index(watchlist['Input'][i])
        else:
            ind = movie_list.index(watchlist['Input'][i])
        review = pd.read_csv(movie_paths[ind])
        x = summarize_text(review,inp,10)
        summary.append(x)
        
    return summary

In [464]:
s = makeSummaries(new_watchlist, features, movies, paths)



In [466]:
new_watchlist['Summaries'] = s

In [467]:
new_watchlist

Unnamed: 0,Movie 1,Movie 2,Movie 3,Movie 4,Movie 5,Movie 6,Movie 7,Movie 8,Movie 9,Movie 10,Input,Review,Summaries
0,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Evil Dead II,"Based on your movie history, you would likely ...",the horror of the first film to the comedy of ...
1,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Duck Soup,"Yes, you would enjoy Duck Soup. This Marx Brot...",has the greatest comedy routine in the history...
2,13 Hours,Mr. Nobody,Incendies,The Third Man,Awakenings,The Blues Brothers,Sinister,Mr. Smith Goes to Washington,Annie,The Jungle Book,Gone Baby Gone,"Yes, I would enjoy Gone Baby Gone based on the...",movie that contains the kind of action and dra...
3,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,WALL·E,"Yes, I think you would enjoy watching WALL·E. ...","the character development, the special-effects..."
4,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,The Babadook,"Yes, I would enjoy The Babadook based on my hi...",drama to explain the pain of the mother and th...
5,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,1408,"Yes, I believe you would enjoy 1408, based on ...",him of the dreadful history of the room and th...
6,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Changeling,"Yes, I would enjoy Changeling. I have watched ...",about the history of the corruption of the L.A...
7,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,The Lego Movie,"Yes, I believe you would enjoy The Lego Movie ...",and so the tension of the drama & action was T...
8,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Draft Day,"Yes, I believe I would enjoy the movie Draft D...",to the drama of the last day of the transfer i...
9,X-Men,Woman in Gold,Bride of Frankenstein,Back to the Future Part II,For Love of the Game,To Kill a Mockingbird,The Prestige,Romeo and Juliet,Deadpool 2,El orfanato,Dunkirk,"Yes, based on your movie history, you would en...","the action to the British ports, the Channel, ..."


In [487]:
from rouge import Rouge
ROUGE = Rouge()
ro = []
for i in range(0,len(new_watchlist['Review'])):
    score = ROUGE.get_scores(new_watchlist['Review'][i], new_watchlist['Summaries'][i])
    p = score[0]["rouge-1"]["p"]
    r = score[0]["rouge-1"]["r"]
    f1 = score[0]["rouge-1"]["f"]
    ro.append([p, r, f1])

In [491]:
p_sum = 0
t = []
for score in ro:
    p_sum += score[0]
    t.append(score[0])
total = len(ro)
avg = p_sum/total

In [492]:
print(avg, max(t))

0.21117941997381723 0.2916666666666667


In [None]:
new_watchlist.to_csv('summaries_1.csv')