<a href="https://colab.research.google.com/github/IgnatiusEzeani/welsh-text-summarizer/blob/main/welsh_summariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up

### a. Installing and importing required Python libraries
To do later:
1.   Mount the GDrive
2.   Clone the repo



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
## install dependencies
def setup(dependencies):
 # create the requirement file with the dependencies
  with open("requirements.txt", "w") as f:
    f.write("\n".join(dependencies))
  !pip install -r requirements.txt
  def install_java():
    !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"   #set environment variable
    !java -version       #check java version

setup(dependencies = ["rouge", "rouge-metric", "lexrank", "summa", "fasttext"])

# load setup.py
!wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/setup.py

# load rouge 2.0 eval jar file
!wget https://github.com/IgnatiusEzeani/summariser/raw/master/rouge2-1.2.2.jar
!wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/rouge.properties

%run setup.py

## Get Articles and Summaries

### a. Extract Wiki articles and summaries

In [None]:
# Extract articles and summaries
data_file = "welsh_wiki_articles.json"
if not os.path.exists(data_file):
  # Download and unzip data.zip
  !wget https://github.com/IgnatiusEzeani/summariser/raw/master/data.zip
  unzip("data.zip")

  folder_path = "data/html"
  welsh_wiki_articles={}
  for fname in os.listdir(folder_path):
      filepath = os.path.join(folder_path, fname)
      html = open(filepath, "r", encoding="utf8").read()
      soup = BeautifulSoup(html, features="html.parser")
      text = soup.get_text()
      #extract row 'fileId', 'title', 'article' and 'summary'
      fileId = getfileID(fname)
      headings = ['title', 'text', 'summary']
      title, article, summary = [
        extract_text(text, f"begin {item}", f"end {item}") for item in headings]
      welsh_wiki_articles[fileId] = {"fileId":f"'{fileId}'", "title":title,
                                     "article":article, "summary":summary}
  # Store in a json file for future use
  welsh_wiki_articles_json_dump = json.dumps(welsh_wiki_articles)
  with open(data_file, "w", encoding = "ISO-8859-1") as jsonfile:
    jsonfile.write(welsh_wiki_articles_json_dump)
    print(f"{data_file} created!")
  
  # Remove the data folder
  shutil.rmtree("data")
else:
  print(f"{data_file} already exists")

# Ensure that "welsh_wiki_articles.json" is available
wiki_articles_summaries_df = \
      pd.io.json.read_json(data_file, orient='index').sort_index()
wiki_articles_summaries_df.head()

### b. Upload human summaries

In [None]:
# Upload and extract Wiki articles and summaries
# !wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/acc_prelim.csv
# human_summaries_prelim_df = pd.read_csv('acc_prelim.csv', encoding="ISO-8859-1")
# human_summaries_prelim_df.dropna(subset=['Crynodeb (tua 250 gair)'], inplace=True)
# print(len(human_summaries_prelim_df))
# human_summaries_prelim_df.head()

In [None]:
# Upload and extract Wiki articles and summaries
# import pandas as pd
!wget https://github.com/IgnatiusEzeani/summariser/raw/master/acc_data.csv
human_summaries_df = pd.read_csv('acc_data.csv', encoding="ISO-8859-1")
human_summaries_df.dropna(subset=['Summary'], inplace=True)
# human_summaries_df.dropna(how='all', axis=1, inplace=True)
print(len(human_summaries_df))
# human_summaries_df.head()

## Run Baseline Models

### 1. BottomLine model
- This model simply takes the top (first) sentence of each Wiki article as its summary.

In [None]:
# Summarise every wiki article with the "bottomline" method.
# NOTE(review): `summarize` is not defined in this notebook; presumably it is
# loaded by `%run setup.py` — confirm its return type there.
bottomline_summaries_df = summarize(wiki_articles_summaries_df, "bottomline")
# bottomline_summaries_df.head()

### 2. LexRank model
Uses lexrank (https://pypi.org/project/lexrank/)

In [None]:
# Summarise every wiki article with the "lexrank" method of the `summarize`
# helper (defined outside this notebook).
lexrank_summaries_df = summarize(wiki_articles_summaries_df, "lexrank")
# lexrank_summaries_df.head()

### 3. TextRank model
Uses `summa` (https://pypi.org/project/summa/)

In [None]:
# Summarise every wiki article with the "textrank" method of the `summarize`
# helper (defined outside this notebook).
textrank_summaries_df = summarize(wiki_articles_summaries_df, "textrank")
# textrank_summaries_df.head()

## Run Topline Models

### 1. TfIdf model
Uses `sklearn`'s [`TfidfVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) and [`cosine_similarity`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html)

In [None]:
# Summarise every wiki article with the "tfidf" method of the `summarize`
# helper (defined outside this notebook) and preview the first rows.
tfidf_summaries_df = summarize(wiki_articles_summaries_df,"tfidf")
tfidf_summaries_df.head()

### 2. Word Embeddings (FastText)
Source: [FastText Library for efficient text classification and representation learning](https://fasttext.cc/docs/en/crawl-vectors.html)

In [None]:
# Collect the corpus features that the embedding cells below look up vectors for.
# NOTE(review): `get_corpus_features` is defined outside this notebook; the
# trailing remark suggests it returns 100 features by default — confirm.
features = get_corpus_features(wiki_articles_summaries_df) # default features=100

In [None]:
if os.path.exists("fastext_feat100_pkl"):
  filtered_fasttext = pickle.load(open('fastext_feat100_pkl', 'rb'))
else:
  !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz
  unzip('cc.cy.300.bin.gz')
  fasttext_model = fasttext.load_model('cc.cy.300.bin')
  filtered_fastext = {feature:fasttext_model[feature] for feature in features}

  # pickle filtered fasttext for future use
  with open('fastext_feat100_pkl', 'wb') as fasttext_pkl:
    pickle.dump(filtered_fastext, fasttext_pkl)
  !rm 'cc.cy.300.bin'

In [None]:
# Summarise every wiki article with the "fasttext" method, passing the filtered
# FastText vectors as the third argument, then preview the first rows.
# TODO: modify to use filtered_fastext keys as features later...
fasttext_summaries_df = summarize(wiki_articles_summaries_df, "fasttext",
                                  filtered_fastext) 
fasttext_summaries_df.head()

### 3. Word Embeddings (WNLT)
Source: Pretrained model from [Welsh Natural Language Technology](https://datainnovation.cardiff.ac.uk/is/wecy/access.html) resources.

In [None]:
# Re-run the helper script downloaded in the setup cell so its definitions are
# (re)loaded into the notebook namespace.
%run setup.py

In [None]:
# Build (or reload) the WNLT skip-gram vectors for the corpus features.
if os.path.exists("wnlt_feat100_pkl"):
  # NOTE(security): unpickling can execute arbitrary code — only load a cache
  # file that this notebook wrote itself.
  with open('wnlt_feat100_pkl', 'rb') as wnlt_pkl:
    filtered_wnlt = pickle.load(wnlt_pkl)
else:
  # The download is commented out, so FastText_WNLT_SkipGram.zip must already
  # be present in the working directory for `unzip` to find it.
  # !wget https://datainnovation.cardiff.ac.uk/is/wecy/files/FastText_WNLT_SkipGram.zip
  unzip("FastText_WNLT_SkipGram.zip")

  # Loading in FastText_WNLT_SkipGram
  wnlt_dir = "./FastText_WNLT_SkipGram/FastText_WNLT_SkipGram"
  wnlt_model = gensim.models.FastText.load(os.path.join(wnlt_dir, 'skipgram_subword_model.model'))

  # Look vectors up through `.wv`: indexing the model object directly is
  # deprecated in gensim 3.x and removed in gensim 4.
  filtered_wnlt = {feature:wnlt_model.wv[feature] for feature in features}

  # pickle filtered WNLT for future use
  # (this used to dump `filtered_fastext`, storing the wrong embeddings in
  # the WNLT cache)
  with open('wnlt_feat100_pkl', 'wb') as wnlt_pkl:
    pickle.dump(filtered_wnlt, wnlt_pkl)

  # Remove the extracted model folder now that the subset has been cached.
  shutil.rmtree('FastText_WNLT_SkipGram')

In [None]:
# Summarise every wiki article with the "wnlt" method, passing the filtered
# WNLT vectors as the third argument, then preview the first rows.
wnlt_summaries_df = summarize(wiki_articles_summaries_df, "wnlt", filtered_wnlt)
wnlt_summaries_df.head()

## Run Model Evaluation
Uses `rouge-2.0` (https://github.com/kavgan/ROUGE-2.0)

### Generating reference and system summaries

In [None]:
# # download and unzip system summaries
# #Todo: put in a function
# !wget https://github.com/IgnatiusEzeani/summariser/raw/master/system_summaries.zip
# zip_file = "system_summaries.zip"
# unzip(zip_file)

# system_summaries_dir = os.path.join('.',zip_file[:-4],zip_file[:-4])

# summaries_bottomline_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_bottomline.json'),
#                                        orient="index")
# summaries_lexrank_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_lexrank.json'),
#                                        orient="index")
# summaries_textrank_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_textrank.json'),
#                                        orient="index")
# summaries_tfidf_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_tfidf.json'),
#                                        orient="index")
# summaries_fasttext_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_fasttext.json'),
#                                        orient="index")
# summaries_wnlt_df = pd.read_json(os.path.join(system_summaries_dir,
#                                       'summaries_wnlt.json'),
#                                        orient="index")
# Read each model's stored system summaries from its JSON file
# (orient="index": the top-level JSON keys become the row index).
(summaries_bottomline_df,
 summaries_lexrank_df,
 summaries_textrank_df,
 summaries_tfidf_df,
 summaries_fasttext_df,
 summaries_wnlt_df) = [
    pd.read_json(json_path, orient="index")
    for json_path in ('summaries_bottomline.json',
                      'summaries_lexrank.json',
                      'summaries_textrank.json',
                      'summaries_tfidf.json',
                      'summaries_fasttext.json',
                      'summaries_wnlt.json')
]

In [None]:
# Prepare the reference and system summaries used by the evaluation.
# Any existing 'projects' folder is removed first.
if os.path.exists('projects'):
  shutil.rmtree('projects')

# Only the wiki summaries are used as references here; the human summaries
# can be added back as an extra `(human_summaries_df, "human")` entry.
reference_types = [(wiki_articles_summaries_df, "wiki")]

# (model name, system-summary frame) pairs, one per summariser.
models = [
    ("bottomline", summaries_bottomline_df),
    ("lexrank", summaries_lexrank_df),
    ("textrank", summaries_textrank_df),
    ("tfidf", summaries_tfidf_df),
    ("fasttext", summaries_fasttext_df),
    ("wnlt", summaries_wnlt_df),
]

# References first, then the system outputs.
gen_reference_summaries(reference_types)
gen_system_summaries(models)

### Run evaluation of system summaries against reference summaries

In [None]:
# run evaluation
# Launches the ROUGE 2.0 jar downloaded in the setup cell; needs a Java runtime.
# NOTE(review): presumably configured through the downloaded rouge.properties
# in the working directory — confirm.
!java -jar "rouge2-1.2.2.jar"

In [None]:
# for model, scores in model_results.items():
#   print(model)
#   for scr in scores:
#     print(scr)

In [None]:
# Load the scores from 'results_wiki_all.csv' into `model_results`.
# NOTE(review): `show_results` and `get_model_results` are defined outside this
# notebook; later cells index the result by names such as 'BOTTOMLINE1.TXT'
# and unpack each entry into (pre, rec, f1).
model_results = get_model_results(show_results('results_wiki_all.csv'))

In [None]:
# Inspect the precision / recall / F1 entries stored for the bottomline model.
pre, rec, f1 = model_results['BOTTOMLINE1.TXT']
pre, rec, f1

## Plotting Results Graphs and Barcharts
Generate plots of the system output results

In [None]:
# One grouped bar chart per model: precision, recall and F1 for each metric.
model_types = ['BOTTOMLINE1.TXT', 'TEXTRANK1.TXT', 'LEXRANK1.TXT',
                'TFIDF1.TXT', 'FASTTEXT1.TXT', 'WNLT1.TXT']

metric_types = ['Rg-1', 'Rg-2', 'Rg-L', 'R-SU4']
x_ticks = np.arange(len(metric_types))
rows, cols = 2, 3
bar_width = 0.3

fig, axs = plt.subplots(rows, cols, figsize=(18,10))
plt.setp(axs, xticks=x_ticks, xticklabels=metric_types)

for i, model_type in enumerate(model_types):
  # Fill the grid row by row; divmod gives the (row, column) of panel i.
  r, c = divmod(i, cols)
  axs[r, c].set_title(model_type)

  # assumes each of pre/rec/f1 holds one value per entry of `metric_types`
  # — TODO confirm against get_model_results
  pre, rec, f1 = model_results[model_type]

  # Bars are positioned relative to `x_ticks` (the previous code referenced an
  # undefined `X_axis`, which raised a NameError).
  axs[r, c].bar(x_ticks - bar_width, pre, bar_width, label = 'Pre')
  axs[r, c].bar(x_ticks + 0.0, rec, bar_width, label = 'Rec')
  axs[r, c].bar(x_ticks + bar_width, f1,  bar_width, label = 'F1')
  # Show the labels set above so the three bar groups can be told apart.
  axs[r, c].legend()

for ax in axs.flat:
    ax.set(ylabel='Scores', xlabel='Metrics')

# Hide x labels and tick labels for top plots and y ticks for right plots.
# for ax in axs.flat:
#     ax.label_outer()

# plt.show()

In [None]:
# Scratch example: a 2x3 subplot grid with a single sine curve in the
# top-left panel; the other panels stay empty.
x = np.linspace(0, 2 * np.pi, 400)
y = np.sin(x ** 2)

fig, axs = plt.subplots(2, 3, figsize=(15,10))
axs[0, 0].plot(x, y)
axs[0, 0].set_title('Axis [0, 0]')

for ax in axs.flat:
    # Label every panel, then keep the labels only on the outer edge of the
    # grid (x labels on the bottom row, y labels on the left column).
    ax.set(xlabel='x-label', ylabel='y-label')
    ax.label_outer()

In [None]:
# Hard-coded copy of a ROUGE results table (precision / recall / F-measure per
# model and metric, LaTeX-style row separators). The figures are the same as
# those in the "Human Summaries" table in the markdown below. `res` is not
# used by any other cell visible in this notebook.
res="""
Models+Metric   Pre        Rec     F-Meas
BOTTOMLINE1.TXT:
 ROUGE-1   &    70.52  &  06.34  &  11.15 \\
 ROUGE-2   &    42.20  &  03.71  &  06.53 \\
 ROUGE-L   &    61.69  &  08.25  &  13.94 \\
 ROUGE-SU4 &    44.62  &  03.77  &  06.65 \\
TEXTRANK1.TXT:
 ROUGE-1   &    27.60  &  70.17  &  36.73 \\
 ROUGE-2   &    15.90  &  39.89  &  21.04 \\
 ROUGE-L   &    26.85  &  56.05  &  33.97 \\
 ROUGE-SU4 &    17.36  &  42.82  &  22.83 \\ 
LEXRANK1.TXT:
 ROUGE-1   &    30.10  &  67.44  &  38.11 \\
 ROUGE-2   &    17.09  &  37.53  &  21.43 \\
 ROUGE-L   &    27.85  &  53.08  &  33.96 \\
 ROUGE-SU4 &    18.78  &  40.42  &  23.38 \\

TFIDF1.TXT:
 ROUGE-1   &    29.83  &  67.59  &  37.84 \\
 ROUGE-2   &    17.05  &  37.91  &  21.41 \\
 ROUGE-L   &    28.43  &  52.95  &  34.31 \\
 ROUGE-SU4 &    18.76  &  40.89  &  23.39 \\ 
FASTTEXT1.TXT:
 ROUGE-1   &    29.37  &  68.68  &  37.75 \\
 ROUGE-2   &    16.72  &  38.52  &  21.35 \\
 ROUGE-L   &    27.74  &  55.11  &  34.35 \\
 ROUGE-SU4 &    18.37  &  41.48  &  23.27 \\
WNLT1.TXT:
 ROUGE-1   &    28.66  &  70.78  &  37.50 \\
 ROUGE-2   &    16.48  &  40.04  &  21.37 \\
 ROUGE-L   &    27.24  &  57.13  &  34.35 \\
 ROUGE-SU4 &    17.99  &  43.01  &  23.21 \\
"""

**Human Summaries**
```
Models+Metric   Pre	    Rec     F-Meas
BOTTOMLINE1.TXT:
 ROUGE-1   &	70.52  &  06.34  &  11.15 \\
 ROUGE-2   &	42.20  &  03.71  &  06.53 \\
 ROUGE-L   &	61.69  &  08.25  &  13.94 \\
 ROUGE-SU4 &	44.62  &  03.77  &  06.65 \\
TEXTRANK1.TXT:
 ROUGE-1   &	27.60  &  70.17  &  36.73 \\
 ROUGE-2   &	15.90  &  39.89  &  21.04 \\
 ROUGE-L   &	26.85  &  56.05  &  33.97 \\
 ROUGE-SU4 &	17.36  &  42.82  &  22.83 \\ 
LEXRANK1.TXT:
 ROUGE-1   &	30.10  &  67.44  &  38.11 \\
 ROUGE-2   &	17.09  &  37.53  &  21.43 \\
 ROUGE-L   &	27.85  &  53.08  &  33.96 \\
 ROUGE-SU4 &	18.78  &  40.42  &  23.38 \\
 
TFIDF1.TXT:
 ROUGE-1   &	29.83  &  67.59  &  37.84 \\
 ROUGE-2   &	17.05  &  37.91  &  21.41 \\
 ROUGE-L   &	28.43  &  52.95  &  34.31 \\
 ROUGE-SU4 &	18.76  &  40.89  &  23.39 \\ 
FASTTEXT1.TXT:
 ROUGE-1   &	29.37  &  68.68  &  37.75 \\
 ROUGE-2   &	16.72  &  38.52  &  21.35 \\
 ROUGE-L   &	27.74  &  55.11  &  34.35 \\
 ROUGE-SU4 &	18.37  &  41.48  &  23.27 \\
WNLT1.TXT:
 ROUGE-1   &	28.66  &  70.78  &  37.50 \\
 ROUGE-2   &	16.48  &  40.04  &  21.37 \\
 ROUGE-L   &	27.24  &  57.13  &  34.35 \\
 ROUGE-SU4 &	17.99  &  43.01  &  23.21 \\
```

**Wiki Summaries**
```
Models+Metric   Pre	    Rec     F-Meas
BOTTOMLINE1.TXT:
 ROUGE-1   &	99.51  &  24.45  &  34.07 \\
 ROUGE-2   &	99.50  &  23.79  &  33.17 \\
 ROUGE-L   &	99.53  &  29.03  &  40.26 \\
 ROUGE-SU4 &	99.48  &  23.11  &  32.23 \\
TEXTRANK1.TXT:
 ROUGE-1   &	21.12  &  81.91  &  29.56 \\
 ROUGE-2   &	17.98  &  64.62  &  24.61 \\
 ROUGE-L   &	24.47  &  73.78  &  33.28 \\
 ROUGE-SU4 &	18.67  &  66.19  &  25.40 \\
LEXRANK1.TXT:
 ROUGE-1   &	22.90  &  79.31  &  30.98 \\
 ROUGE-2   &	19.04  &  60.95  &  25.07 \\
 ROUGE-L   &	25.56  &  70.82  &  33.81 \\
 ROUGE-SU4 &	19.87  &  62.54  &  25.97 \\
TFIDF1.TXT:
 ROUGE-1   &	22.88  &  80.41  &  31.11 \\
 ROUGE-2   &	19.42  &  63.72  &  25.81 \\
 ROUGE-L   &	26.44  &  72.38  &  34.93 \\
 ROUGE-SU4 &	20.21  &  65.16  &  26.66 \\
FASTTEXT1.TXT:
 ROUGE-1   &	22.44  &  80.57  &  30.71 \\
 ROUGE-2   &	18.87  &  62.82  &  25.20 \\
 ROUGE-L   &	25.21  &  72.70  &  33.83 \\
 ROUGE-SU4 &	19.67  &  64.43  &  26.08 \\
WNLT1.TXT:
 ROUGE-1   &	21.73  &  81.80  &  30.19 \\
 ROUGE-2   &	18.33  &  63.04  &  24.83 \\
 ROUGE-L   &	24.48  &  73.63  &  33.32 \\
 ROUGE-SU4 &	19.04  &  64.77  &  25.64 \\
```

## Playground

In [None]:
# Count whitespace-separated tokens in every article and print them in a
# grid of 15 entries per line, followed by the smallest and largest count.
article_tokens = []
# Iterate over the frame itself instead of a hard-coded range(513), so the
# cell works for any number of articles (fewer rows used to raise IndexError,
# extra rows were silently ignored).
for i, article in enumerate(wiki_articles_summaries_df['article']):
  tokens = len(article.split())
  article_tokens.append(tokens)
  if i % 15 == 0: print()
  print(f"{i:>3d} = {tokens:5d}", end=" | ")

# min()/max() raise ValueError on an empty list, so skip the summary line
# when there are no articles.
if article_tokens:
  print(f"\n{min(article_tokens)}, {max(article_tokens)}")

In [None]:
# Histogram of article lengths in 1000-token buckets covering 0..14999;
# counts outside that range fall into no bucket.
bins = range(0, 15000, 1000)
bin_dict = dict.fromkeys(bins, 0)
for c in article_tokens:
  # Lower edge of the 1000-wide bucket that contains this count.
  rng = (c // 1000) * 1000
  if rng in bin_dict:
    bin_dict[rng] += 1
print(f"{bin_dict}, {sum(bin_dict.values())}")

# Relabel each bucket by its upper bound in thousands, e.g. 0 -> "<1k".
bin_dict = {f"<{str((k+1000)//1000)}k":v for k, v in bin_dict.items()}
print(bin_dict)

In [None]:
# Bar chart of the number of articles per token-count bucket.
# The counts are hard-coded here, with everything above 6k merged into '>6k'.
bin_dict = {'<1k': 297, '<2k': 139, '<3k': 31, '<4k': 18, '<5k': 13, '<6k': 7, '>6k': 8}
x1, y1 = bin_dict.keys(), bin_dict.values()

plt.bar(x1, y1)
# Rotate the bucket labels so they read vertically.
plt.xticks(rotation=90)
plt.xlabel('Token count (x 1000)')
plt.ylabel('No of Articles')

plt.show()