## **Importing the Libraries & Loading the Data**

In [None]:
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt
%matplotlib inline
from pandas_profiling import ProfileReport

In [None]:
# Read the two raw CSV inputs: the movies metadata table and the MPST table.
movie_metadata, mpst_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/movies_metadata.csv', 'data/mpst_full_data.csv')
)

### **Merging the mpst and movies_metadata files on imdb_id to get our final dataset**

In [None]:
# Join the two tables on their shared `imdb_id` column (pandas' default
# inner join), then list the columns of the merged frame.
dataset = movie_metadata.merge(mpst_data, on='imdb_id')
dataset.columns

In [None]:
# Keep only the three text columns: tagline, plot synopsis and overview.
dataset = dataset.loc[:, ['tagline', 'plot_synopsis', 'overview']]
dataset.shape

### **Generating the ProfileReport for the dataset**

In [None]:
# Build a pandas-profiling report object for `dataset`.
profile = ProfileReport(dataset, title="Pandas Profiling Report")

In [None]:
# Show the profiling report through its widget interface.
profile.to_widgets()

## **Dataset Cleaning**

In [None]:
# Drop every row that has a missing value in any column, then drop exact
# duplicate rows, keeping the first occurrence of each.
# `drop_duplicates` returns a new DataFrame rather than modifying in place,
# so its result has to be assigned for the de-duplication to take effect.
df = dataset.dropna(axis=0).drop_duplicates(keep='first')
df.shape

In [None]:
# Preview the first four rows of the cleaned frame.
df.head(4)

#### **Analyzing the number of words in each column of the dataset**

In [None]:
# Whitespace-delimited word count of every plot synopsis.
number_of_words_ps = df["plot_synopsis"].map(lambda synopsis: len(synopsis.split()))

# Histogram of the word counts, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(number_of_words_ps, bins=20)
ax.set_xlabel("Number of words")
ax.set_ylabel("Number of Data points")
ax.set_title("Number of words in Plot Synopsis")
plt.show()

In [None]:
# Whitespace-delimited word count of every overview.
number_of_words_ov = df["overview"].map(lambda overview: len(overview.split()))

# Histogram of the word counts, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(number_of_words_ov, bins=20)
ax.set_xlabel("Number of words")
ax.set_ylabel("Number of Data points")
ax.set_title("Number of words in Overview")
plt.show()

In [None]:
# Whitespace-delimited word count of every tagline.
number_of_words_tl = df["tagline"].map(lambda tagline: len(tagline.split()))

# Histogram of the word counts, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(number_of_words_tl, bins=20)
ax.set_xlabel("Number of words")
ax.set_ylabel("Number of Data points")
ax.set_title("Number of words in Taglines")
plt.show()

## **Summarization Models**

### Abstractive Summarization

#### **T5-Small Model**

In [None]:
# Load the saved T5 results table and preview it.
df_t5_res = pd.read_csv("data/Results/t5_results.csv")
df_t5_res.head()  # head() shows the first 5 rows by default

#### **BART Model**

In [None]:
# Load the saved BART results table and preview it.
df_bart_res = pd.read_csv("data/Results/bart_results.csv")
df_bart_res.head()  # head() shows the first 5 rows by default

### Extractive Summarization

In [None]:
# Load the saved extractive-summarization results table and preview it.
df_extractive_res = pd.read_csv("data/Results/extractive_results.csv")
df_extractive_res.head()  # head() shows the first 5 rows by default

## **Evaluation**

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

### **Rouge-1 Scores**

In [None]:
# Scorer instance shared by the notebook; `get_rouge_score` reads it as a global.
rouge = Rouge()

In [None]:
def get_rouge_score(pred_val,ref_val):
    """Return the averaged ROUGE-1 F-score of predictions against references.

    Both arguments are handed unchanged to the module-level ``rouge`` scorer
    with ``avg=True``; the ``'f'`` entry of its ``'rouge-1'`` result is
    returned.

    Parameters
    ----------
    pred_val
        Generated texts (hypotheses).
    ref_val
        Reference texts, presumably aligned one-to-one with ``pred_val``.
    """
    averaged_scores = rouge.get_scores(pred_val, ref_val, avg=True)
    rouge_1 = averaged_scores['rouge-1']
    return rouge_1['f']

In [None]:
# ROUGE-1 F-score of each model's predictions against the reference taglines,
# computed in the order T5, BART, extractive.
rouge_t5, rouge_bart, rouge_ext = (
    get_rouge_score(results["predictions"], results["tagline"])
    for results in (df_t5_res, df_bart_res, df_extractive_res)
)

### **BLEU Score**

In [None]:
def get_bleu_score(pred_val,ref_val):
    """Return the mean sentence-level BLEU score of predictions vs. references.

    Each prediction is paired with the reference at the same position. Both
    are split on single spaces and scored with ``sentence_bleu``, using one
    reference per prediction. The scores are averaged over the number of
    predictions.

    Pairing is positional, not label-based, so a pandas Series whose index is
    not 0..n-1 (for example after filtering rows) is handled correctly.

    Parameters
    ----------
    pred_val : iterable of str
        Generated texts (list, pandas Series, ...).
    ref_val : iterable of str
        Reference texts. Must provide at least as many items as ``pred_val``;
        surplus references are ignored.

    Returns
    -------
    float
        Mean BLEU score, or ``0.0`` when ``pred_val`` is empty.

    Raises
    ------
    ValueError
        If there are fewer references than predictions.
    """
    # Materialize the values so indexing never goes through Series labels.
    predictions = list(pred_val)
    references = list(ref_val)

    # Nothing to score: avoid dividing by zero.
    if not predictions:
        return 0.0
    if len(references) < len(predictions):
        raise ValueError(
            "ref_val has fewer items (%d) than pred_val (%d)"
            % (len(references), len(predictions))
        )

    total_bleu_score = 0
    for prediction, reference in zip(predictions, references):
        total_bleu_score += sentence_bleu(
            [reference.split(' ')], prediction.split(' ')
        )
    return total_bleu_score / len(predictions)

In [None]:
# Mean BLEU score of each model's predictions against the reference taglines,
# computed in the order T5, BART, extractive.
bleu_t5, bleu_bart, bleu_ext = (
    get_bleu_score(results["predictions"], results["tagline"])
    for results in (df_t5_res, df_bart_res, df_extractive_res)
)

### **Results**

In [None]:
# Bar chart of the ROUGE-1 score per model, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.bar(
    ['T5', 'BART', 'BERT'],
    [rouge_t5, rouge_bart, rouge_ext],
    width=0.4,
)
ax.set_xlabel("Summarization Models")
ax.set_ylabel("Rouge-1 Score")
ax.set_title("Rouge-1 Score for different Models")
plt.show()

In [None]:
# Bar chart of the mean BLEU score per model.
# The models are listed in the same order as on the ROUGE-1 chart
# (T5, BART, BERT) so the two charts can be compared position-for-position.
# Each label stays paired with the same score variable as before.
plt.bar(['T5','BART','BERT'], [bleu_t5,bleu_bart,bleu_ext],
        width = 0.4)

plt.xlabel("Summarization Models")
plt.ylabel("Bleu Score")
plt.title("Bleu Score for different Models")
plt.show()