<a href="https://colab.research.google.com/github/IgnatiusEzeani/welsh-text-summarizer/blob/main/welsh_summariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up

### a. Installing and importing required Python libraries

In [None]:
## install dependencies
def setup(dependencies):
  """Write `dependencies` to requirements.txt and pip-install that file.

  Args:
    dependencies: iterable of pip requirement strings; they are joined with
      newlines, so each ends up on its own line of requirements.txt.
  """
  # create the requirement file with the dependencies
  with open("requirements.txt", "w") as f:
    f.write("\n".join(dependencies))
  !pip install -r requirements.txt
  # NOTE(review): install_java is only *defined* below; nothing in this cell
  # calls it, so setup() neither installs Java nor sets JAVA_HOME. It also uses
  # `os`, for which no import is visible before this point.
  def install_java():
    """Install OpenJDK 8 with apt-get, set JAVA_HOME to it and print the Java version."""
    !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"   #set environment variable
    !java -version       #check java version

setup(dependencies = ["rouge", "rouge-metric", "lexrank", "summa", "fasttext"])

# Download setup.py (it is executed with %run at the end of this cell)
!wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/setup.py

# Download the ROUGE 2.0 evaluation jar file and its properties file
!wget https://github.com/IgnatiusEzeani/summariser/raw/master/rouge2-1.2.2.jar
!wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/rouge.properties

# Execute setup.py in the notebook namespace. Later cells use names that are
# not defined in this notebook (e.g. os, pd, unzip, summarize) - presumably
# they are provided by this script; confirm against setup.py.
%run setup.py

## Get Articles and Summaries

### a. Extract Wiki articles and summaries

In [None]:
# Extract articles and summaries
data_file = "welsh_wiki_articles.json"
if not os.path.exists(data_file):
  # Download and unzip data.zip
  !wget https://github.com/IgnatiusEzeani/summariser/raw/master/data.zip
  unzip("data.zip")

  folder_path = "data/html"
  welsh_wiki_articles={}
  for fname in os.listdir(folder_path):
      filepath = os.path.join(folder_path, fname)
      html = open(filepath, "r", encoding="utf8").read()
      soup = BeautifulSoup(html, features="html.parser")
      text = soup.get_text()
      #extract row 'fileId', 'title', 'article' and 'summary'
      fileId = getfileID(fname)
      headings = ['title', 'text', 'summary']
      title, article, summary = [
        extract_text(text, f"begin {item}", f"end {item}") for item in headings]
      welsh_wiki_articles[fileId] = {"fileId":f"'{fileId}'", "title":title,
                                     "article":article, "summary":summary}
  # Store in a json file for future use
  welsh_wiki_articles_json_dump = json.dumps(welsh_wiki_articles)
  with open(data_file, "w", encoding = "ISO-8859-1") as jsonfile:
    jsonfile.write(welsh_wiki_articles_json_dump)
    print(f"{data_file} created!")
  
  # Remove the data folder
  shutil.rmtree("data")
else:
  print(f"{data_file} already exists")

# Ensure that "welsh_wiki_articles.json" is available
wiki_articles_summaries_df = \
      pd.io.json.read_json(data_file, orient='index').sort_index()
wiki_articles_summaries_df.head()

### b. Upload human summaries

In [None]:
# (Disabled) Load the preliminary human summaries from acc_prelim.csv
# !wget https://raw.githubusercontent.com/IgnatiusEzeani/summariser/master/acc_prelim.csv
# human_summaries_prelim_df = pd.read_csv('acc_prelim.csv', encoding="ISO-8859-1")
# human_summaries_prelim_df.dropna(subset=['Crynodeb (tua 250 gair)'], inplace=True)
# print(len(human_summaries_prelim_df))
# human_summaries_prelim_df.head()

In [None]:
# Upload and extract Wiki articles and summaries
# import pandas as pd
!wget https://github.com/IgnatiusEzeani/summariser/raw/master/acc_data.csv
human_summaries_df = pd.read_csv('acc_data.csv', encoding="ISO-8859-1")
human_summaries_df.dropna(subset=['Summary'], inplace=True)
# human_summaries_df.dropna(how='all', axis=1, inplace=True)
print(len(human_summaries_df))
# human_summaries_df.head()

## Run Baseline Models

### 1. BottomLine model
- This model simply takes the top (first) sentence of each Wiki article.

In [None]:
# Re-run setup.py before summarising. `summarize` is not defined in this
# notebook - presumably it comes from setup.py; confirm.
%run setup.py
# Call `summarize` on the Wiki articles frame with the method name "bottomline"
bottomline_summaries_df = summarize(wiki_articles_summaries_df, "bottomline")
# bottomline_summaries_df.head()

### 2. LexRank model
Uses lexrank (https://pypi.org/project/lexrank/)

In [None]:
# Call `summarize` on the Wiki articles frame with the method name "lexrank"
# (`summarize` is not defined in this notebook - presumably from setup.py; confirm)
lexrank_summaries_df = summarize(wiki_articles_summaries_df, "lexrank")
# lexrank_summaries_df.head()

### 3. TextRank model
Uses `summa` (https://pypi.org/project/summa/)

In [None]:
# Call `summarize` on the Wiki articles frame with the method name "textrank"
# (`summarize` is not defined in this notebook - presumably from setup.py; confirm)
textrank_summaries_df = summarize(wiki_articles_summaries_df, "textrank")
# textrank_summaries_df.head()

## Run Topline Models

### 1. TfIdf model
Uses `sklearn`'s [`TfidfVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) and [`cosine_similarity`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html)

In [None]:
# Call `summarize` on the Wiki articles frame with the method name "tfidf"
# (`summarize` is not defined in this notebook - presumably from setup.py; confirm)
tfidf_summaries_df = summarize(wiki_articles_summaries_df, "tfidf")
# tfidf_summaries_df.head()

### 2. Word Embeddings (FastText)
Source: [FastText Library for efficient text classification and representation learning](https://fasttext.cc/docs/en/crawl-vectors.html)

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz
unzip('cc.cy.300.bin.gz')
fasttext_model = fasttext.load_model('cc.cy.300.bin')
fasttext_summaries_df = summarize(wiki_articles_summaries_df, "fasttext",
                                                           fasttext_model)
# fasttext_summaries_df.head(10)

### 3. Word Embeddings (WNLT)
Source: Pretrained model from [Welsh Natural Language Technology](https://datainnovation.cardiff.ac.uk/is/wecy/access.html) resources.

In [None]:
!wget https://datainnovation.cardiff.ac.uk/is/wecy/files/FastText_WNLT_SkipGram.zip
unzip("FastText_WNLT_SkipGram.zip")

# Loading in FastText_WNLT_SkipGram
wnlt_dir = "./FastText_WNLT_SkipGram/FastText_WNLT_SkipGram"
wnlt_model = gensim.models.FastText.load(os.path.join(wnlt_dir, 'skipgram_subword_model.model'))
wnlt_summaries_df = summarize(wiki_articles_summaries_df, "wnlt", wnlt_model)
wnlt_summaries_df.head()

## Run Model Evaluation
Uses `rouge-2.0` (https://github.com/kavgan/ROUGE-2.0)

### Generating reference and system summaries

In [None]:
# download and unzip system summaries
!wget https://github.com/IgnatiusEzeani/summariser/raw/master/system_summaries.zip
zip_file = "system_summaries.zip"
unzip(zip_file)

system_summaries_dir = os.path.join('.',zip_file[:-4],zip_file[:-4])

In [None]:
def _read_system_summaries(file_name):
  """Read `file_name` from `system_summaries_dir` into a DataFrame (orient="index")."""
  return pd.read_json(os.path.join(system_summaries_dir, file_name),
                      orient="index")

# One frame per summarisation model, loaded from the unpacked archive
summaries_bottomline_df = _read_system_summaries('summaries_bottomline.json')
summaries_lexrank_df = _read_system_summaries('summaries_lexrank.json')
summaries_textrank_df = _read_system_summaries('summaries_textrank.json')
summaries_tfidf_df = _read_system_summaries('summaries_tfidf.json')
summaries_fasttext_df = _read_system_summaries('summaries_fasttext.json')
summaries_wnlt_df = _read_system_summaries('summaries_wnlt.json')

In [None]:
# Reference summaries to score against, as (DataFrame, label) pairs.
# Only the Wiki references are active; the variant that also includes the
# human summaries is kept here, commented out:
# reference_types = [(human_summaries_df, "human"), (wiki_articles_summaries_df, "wiki")]
reference_types = [
    (wiki_articles_summaries_df, "wiki"),
]

# System summaries to evaluate, as (model name, DataFrame) pairs
models = [
    ("bottomline", summaries_bottomline_df),
    ("lexrank", summaries_lexrank_df),
    ("textrank", summaries_textrank_df),
    ("tfidf", summaries_tfidf_df),
    ("fasttext", summaries_fasttext_df),
    ("wnlt", summaries_wnlt_df),
]

In [None]:
# `gen_references` is not defined in this notebook (presumably from setup.py -
# confirm); it is given the (DataFrame, label) reference pairs.
gen_references(reference_types)

In [None]:
# `gen_model_summaries` is not defined in this notebook (presumably from
# setup.py - confirm); it is given the (model name, DataFrame) pairs.
gen_model_summaries(models)

In [None]:
# run evaluation
# Executes the ROUGE 2.0 jar downloaded in the setup cell. Presumably it is
# configured through rouge.properties and writes results.csv - confirm.
!java -jar "rouge2-1.2.2.jar"

In [None]:
# Values looked up in the 'System Name' and 'ROUGE-Type' columns of results.csv
model_types = ['BOTTOMLINE1.TXT', 'TEXTRANK1.TXT', 'LEXRANK1.TXT',
               'TFIDF1.TXT', 'FASTTEXT1.TXT', 'WNLT1.TXT']
rouge_types = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-SU4']
eval_results_df = pd.read_csv('results.csv', encoding="ISO-8859-1")

# Print one LaTeX-style table row ("& P & R & F \\") per model and ROUGE type
print("Models+Metric   Pre\t    Rec     F-Meas")
for model in model_types:
  print(f"{model}:\n", end=" ")
  for metric in rouge_types:
    print(f"{metric:10s}", end="")
    # Boolean mask instead of .loc[np.where(...)]: same rows, and it does not
    # rely on the index being the default 0..n-1 range.
    is_selected = ((eval_results_df['System Name'] == model) &
                   (eval_results_df['ROUGE-Type'] == metric))
    model_result_df = eval_results_df[is_selected]
    # Column-wise mean as percentages. DataFrame.mean() keeps one value per
    # column; np.mean(DataFrame) collapses to a single scalar on pandas >= 2.0.
    res = model_result_df[['Avg_Recall', 'Avg_Precision', 'Avg_F-Score']].mean() * 100
    # Access by column label (printed order: precision, recall, F-score);
    # integer positions on a string-labelled Series are deprecated.
    print(f"&\t{res['Avg_Precision']:05.2f}  &  {res['Avg_Recall']:05.2f}  &  {res['Avg_F-Score']:05.2f} \\\\", end="\n ")
  print()

**Human Summaries**
```
Models+Metric   Pre	    Rec     F-Meas
BOTTOMLINE1.TXT:
 ROUGE-1   &	70.52  &  06.34  &  11.15 \\
 ROUGE-2   &	42.20  &  03.71  &  06.53 \\
 ROUGE-L   &	61.69  &  08.25  &  13.94 \\
 ROUGE-SU4 &	44.62  &  03.77  &  06.65 \\
TEXTRANK1.TXT:
 ROUGE-1   &	27.60  &  70.17  &  36.73 \\
 ROUGE-2   &	15.90  &  39.89  &  21.04 \\
 ROUGE-L   &	26.85  &  56.05  &  33.97 \\
 ROUGE-SU4 &	17.36  &  42.82  &  22.83 \\ 
LEXRANK1.TXT:
 ROUGE-1   &	30.10  &  67.44  &  38.11 \\
 ROUGE-2   &	17.09  &  37.53  &  21.43 \\
 ROUGE-L   &	27.85  &  53.08  &  33.96 \\
 ROUGE-SU4 &	18.78  &  40.42  &  23.38 \\
 
TFIDF1.TXT:
 ROUGE-1   &	29.83  &  67.59  &  37.84 \\
 ROUGE-2   &	17.05  &  37.91  &  21.41 \\
 ROUGE-L   &	28.43  &  52.95  &  34.31 \\
 ROUGE-SU4 &	18.76  &  40.89  &  23.39 \\ 
FASTTEXT1.TXT:
 ROUGE-1   &	29.37  &  68.68  &  37.75 \\
 ROUGE-2   &	16.72  &  38.52  &  21.35 \\
 ROUGE-L   &	27.74  &  55.11  &  34.35 \\
 ROUGE-SU4 &	18.37  &  41.48  &  23.27 \\
WNLT1.TXT:
 ROUGE-1   &	28.66  &  70.78  &  37.50 \\
 ROUGE-2   &	16.48  &  40.04  &  21.37 \\
 ROUGE-L   &	27.24  &  57.13  &  34.35 \\
 ROUGE-SU4 &	17.99  &  43.01  &  23.21 \\
```

**Wiki Summaries**
```
Models+Metric   Pre	    Rec     F-Meas
BOTTOMLINE1.TXT:
 ROUGE-1   &	99.51  &  24.45  &  34.07 \\
 ROUGE-2   &	99.50  &  23.79  &  33.17 \\
 ROUGE-L   &	99.53  &  29.03  &  40.26 \\
 ROUGE-SU4 &	99.48  &  23.11  &  32.23 \\
TEXTRANK1.TXT:
 ROUGE-1   &	21.12  &  81.91  &  29.56 \\
 ROUGE-2   &	17.98  &  64.62  &  24.61 \\
 ROUGE-L   &	24.47  &  73.78  &  33.28 \\
 ROUGE-SU4 &	18.67  &  66.19  &  25.40 \\
LEXRANK1.TXT:
 ROUGE-1   &	22.90  &  79.31  &  30.98 \\
 ROUGE-2   &	19.04  &  60.95  &  25.07 \\
 ROUGE-L   &	25.56  &  70.82  &  33.81 \\
 ROUGE-SU4 &	19.87  &  62.54  &  25.97 \\
TFIDF1.TXT:
 ROUGE-1   &	22.88  &  80.41  &  31.11 \\
 ROUGE-2   &	19.42  &  63.72  &  25.81 \\
 ROUGE-L   &	26.44  &  72.38  &  34.93 \\
 ROUGE-SU4 &	20.21  &  65.16  &  26.66 \\
FASTTEXT1.TXT:
 ROUGE-1   &	22.44  &  80.57  &  30.71 \\
 ROUGE-2   &	18.87  &  62.82  &  25.20 \\
 ROUGE-L   &	25.21  &  72.70  &  33.83 \\
 ROUGE-SU4 &	19.67  &  64.43  &  26.08 \\
WNLT1.TXT:
 ROUGE-1   &	21.73  &  81.80  &  30.19 \\
 ROUGE-2   &	18.33  &  63.04  &  24.83 \\
 ROUGE-L   &	24.48  &  73.63  &  33.32 \\
 ROUGE-SU4 &	19.04  &  64.77  &  25.64 \\
```

## Playground

In [None]:
# Show the first rows of the human summaries frame
human_summaries_df.head()

In [None]:
def strnumTo3dgt(x):
  """Return `int(x)` formatted as a zero-padded string of at least 3 digits."""
  return f"{int(x):03d}"


def getfileID(fn):
  """Return the zero-padded file number found before the first '_' in `fn`."""
  number_part = fn.partition('_')[0]
  return strnumTo3dgt(number_part)

In [None]:
from collections import Counter
# Number of rows per article, keyed by the zero-padded file ID
counts = Counter(map(getfileID, human_summaries_df["Article"]))
# (number of distinct file IDs, how many file IDs occur once, twice, ...)
len(counts), Counter(counts.values())

In [None]:
# Number of entries in the 'Article' column
len(human_summaries_df["Article"])

In [None]:
# Number of distinct file IDs
len(counts)

In [None]:
# Print every file ID in ascending order on a single line
for k in sorted(counts):
  print(k, end=' ')

In [None]:
def check_summaries(num_summaries):
  """Collect the rows of articles that have exactly `num_summaries` summaries.

  Reads the notebook-level names `counts` (file ID -> number of rows),
  `human_summaries_df` and `getfileID`.

  Args:
    num_summaries: only articles whose value in `counts` equals this number
      are included in `articles`.

  Returns:
    A tuple `(articles, count_summarizer)`:
      articles: dict mapping file ID to a list of `(row, summariser)` pairs,
        where `row` is the index label of the row in `human_summaries_df`.
      count_summarizer: Counter of the upper-cased, stripped 'Summariser'
        values over all rows; it does not depend on `num_summaries`.
  """
  # Set for O(1) membership tests in the loop below
  selected_ids = {getfileID(file_id) for file_id, count in counts.items()
                  if count == num_summaries}
  count_summarizer = Counter([s.upper().strip() for s in human_summaries_df['Summariser']])
  articles = {}
  # Iterate over index labels and values together. Looking rows up with
  # range(len(df)) is a label lookup and raises KeyError once dropna() has
  # removed rows and left gaps in the index.
  rows = zip(human_summaries_df.index,
             human_summaries_df["Article"],
             human_summaries_df["Summariser"])
  for row, article, summariser in rows:
    article_id = getfileID(article)
    if article_id in selected_ids:
      articles.setdefault(article_id, []).append((row, summariser))
  return articles, count_summarizer

# Print "NAME=count; " for every summariser, sorted by name. Only the second
# return value (the Counter over all rows of the 'Summariser' column) is used,
# so the argument 2 has no effect on this output.
for k, v in sorted(check_summaries(2)[1].items()):
  print(f"{k}={v}; ", end='')