In [None]:
!pip install transformers

In [None]:
import requests
from transformers import AutoTokenizer
import pandas as pd

import nltk
nltk.download("stopwords")
nltk.download("punkt")
import re
import string

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
from nltk.tokenize import word_tokenize as nltk_word_tokenize

from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
import numpy as np
from nltk.corpus import stopwords

from scipy.spatial.distance import cosine

In [None]:
def fetch_and_save_wiki(title):
  """Fetch the plain-text extract of an English Wikipedia article.

  Despite the name, nothing is written to disk: the text is only returned.

  Args:
    title: Title of the Wikipedia page to look up.

  Returns:
    The page's ``extract`` field (plain text, requested via ``explaintext``).

  Raises:
    requests.HTTPError: If the API answers with a non-success HTTP status.
    KeyError: If the response carries no extract for ``title`` (for example
      because the page does not exist).
  """
  response = requests.get(
      "https://en.wikipedia.org/w/api.php",
      params={
          "action":"query",
          "format":"json",
          "titles": title,
          "prop":"extracts",
          "explaintext": True
      },
      # Without a timeout a stalled connection would block the notebook forever.
      timeout=30,
  )
  # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
  response.raise_for_status()
  payload = response.json()

  # A single title was requested, so the first (only) page entry is the one we want.
  page = next(iter(payload["query"]["pages"].values()))
  if "extract" not in page:
    raise KeyError(f"No Wikipedia extract found for title {title!r}")

  return page["extract"]

In [None]:
# Shared tokenizer used for every token count in this notebook
# (raw articles and generated summaries alike).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def clean_text(text):
  """Normalize ``text`` for sentence-level processing.

  Removes every character that is not an ASCII letter, a digit, whitespace,
  a period or one of ``()[]{}``; lowercases the result; and collapses each
  run of whitespace into a single space. Non-ASCII letters (e.g. accented
  characters) are removed as well, because the pattern only keeps A-Z/a-z.
  """
  stripped = re.sub(r'[^A-Za-z0-9\s.\(\)[\]{\}]+' ,'', text)
  return " ".join(stripped.lower().split())

def count_tokens(text):
  """Return how many ids the module-level ``tokenizer`` produces for ``text``.

  Special tokens are requested, so they are part of the count.
  """
  return len(tokenizer.encode(text , add_special_tokens = True))

In [None]:
# Wikipedia page titles of the players whose articles are downloaded.
soccar_player = [
    "Lionel Messi",
    "Cristiano Ronaldo",
    "Ronaldo",
    "Zinedine Zidane",
    "Ronaldinho",
    "Robert Lewandowski",
    "Lothar Matthäus",
    "Marco van Basten",
    "Roberto Baggio",
    "Romário",
    "George Weah",
    "Rivaldo",
    "Luís Figo",
    "Fabio Cannavaro",
    "Kaká",
    "Luka Modrić",
]

# One [title, article text, token count] record per player.
data = []
for player in soccar_player:
  info = fetch_and_save_wiki(player)
  # Token ids are truncated to at most 30000, so the stored count is capped there.
  tokens = tokenizer.encode(
      info,
      add_special_tokens=True,
      truncation=True,
      max_length=30000,
  )
  data.append([player, info, len(tokens)])


In [None]:
# data

In [None]:
# Turn the fetched [title, article text, token count] records into a DataFrame.
df = pd.DataFrame(data , columns = ["soccar_player" , "player_information" , "num_tokens"])
df.head()

In [None]:
# Persist the raw download so the articles can be inspected outside the notebook.
df.to_csv("world_soccar_player.csv")

In [None]:
# Show the full raw article text stored under index label 0.
df['player_information'][0]

In [None]:
# Add a normalized copy of each article (see clean_text); the raw text is kept as-is.
df["player_clean_information"] = df['player_information'].apply(clean_text)

In [None]:
# Preview the frame with the new `player_clean_information` column.
df.head()

In [None]:
# Safeguard: force the cleaned text to str before it is sentence-tokenized
# (clean_text already returns strings, so this should not change any value).
df['player_clean_information'] = df['player_clean_information'].astype(str)

In [None]:
# Sentence tokenization:
def sent_tokenize(text):
  """Split ``text`` into sentences with NLTK and return them as a new list."""
  return [sentence for sentence in nltk_sent_tokenize(text)]

# Sentence cleaning: drop punctuation tokens and English stop words, then lowercase.
def cleanup_sentences(text):
  """Return one cleaned string per sentence of ``text``.

  Each sentence is word-tokenized; tokens that occur as a substring of
  ``string.punctuation`` and tokens whose lowercase form is an English stop
  word are dropped. The surviving tokens are lowercased and re-joined with
  single spaces. The output list is index-aligned with ``sent_tokenize(text)``.
  """
  stop_words = set(stopwords.words("english"))
  cleaned = []
  for sentence in sent_tokenize(text):
    kept = [
        token.lower()
        for token in nltk_word_tokenize(sentence)
        if token not in string.punctuation and token.lower() not in stop_words
    ]
    cleaned.append(" ".join(kept))
  return cleaned

In [None]:
# Each row now holds a list of cleaned sentences (one string per sentence).
df['cleaned_sentences'] = df["player_clean_information"].apply(cleanup_sentences)

In [None]:
# Preview the frame with the new `cleaned_sentences` column.
df.head()

In [None]:
# tfidf

def get_tf_idf(sentences, threshold=0.3):
  """Pick the words that dominate a document according to summed TF-IDF.

  Every sentence is treated as one document for the TF-IDF computation. The
  per-word weights are summed over all sentences, scaled so the heaviest word
  has weight 1, and words whose scaled weight exceeds ``threshold`` are kept.

  Args:
    sentences: Iterable of sentence strings.
    threshold: Cut-off applied to the max-normalized summed weight. Defaults
      to 0.3, the value this notebook has always used.

  Returns:
    List of selected vocabulary words, in the vectorizer's feature order.
  """
  vectorizer = CountVectorizer()
  sent_word_matrix = vectorizer.fit_transform(sentences)

  # Raw (un-normalized, un-smoothed) TF-IDF so that summed weights stay comparable.
  transformer = TfidfTransformer(norm = None , sublinear_tf= False , smooth_idf= False)
  tfidf = transformer.fit_transform(sent_word_matrix)
  tfidf = tfidf.toarray()

  # Centroid: total weight of each word across sentences, scaled to a maximum of 1.
  centroid_vector = tfidf.sum(axis = 0)
  centroid_vector = np.divide(centroid_vector , centroid_vector.max())

  feature_names = vectorizer.get_feature_names_out()

  relevant_vector_indices = np.where(centroid_vector > threshold)[0]

  word_list = [feature_names[idx] for idx in relevant_vector_indices]

  return word_list

In [None]:
# Centroid words per article, computed from that article's cleaned sentences.
df['most_important_words'] = df['cleaned_sentences'].apply(get_tf_idf)
df.head()

In [None]:
def word_vector_cache(sentences , embedding_model):
  """Build a ``word -> vector`` lookup for every word in ``sentences``.

  Args:
    sentences: Iterable of sentence strings; each one is word-tokenized.
    embedding_model: Object exposing ``.wv`` that supports ``word in wv`` and
      ``wv[word]`` (e.g. a trained gensim ``Word2Vec`` model).

  Returns:
    Dict mapping each word known to the model to its vector. Words missing
    from the model's vocabulary are left out; callers that average vectors
    already ignore words absent from this dict. An empty input yields ``{}``.
  """
  word_vectors = dict()
  for sent in sentences:
    for w in nltk_word_tokenize(sent):
      # Look each word up once, and skip words the model was never trained on.
      if w not in word_vectors and w in embedding_model.wv:
        word_vectors[w] = embedding_model.wv[w]
  # Return only after ALL sentences were processed (previously this returned
  # from inside the loop, after the first sentence).
  return word_vectors

In [None]:
def build_embedding_representation(words , word_vectors ,embedding_model):
  """Average the vectors of the ``words`` that appear in ``word_vectors``.

  Words without an entry in ``word_vectors`` are ignored. If none of the
  words has an entry, a zero vector of length ``embedding_model.vector_size``
  is returned.
  """
  total = np.zeros(embedding_model.vector_size , dtype = "float32")
  known_words = set(word_vectors.keys())

  hits = 0
  for token in words:
    if token not in known_words:
      continue
    total = total + word_vectors[token]
    hits += 1

  if hits == 0:
    return total
  return np.divide(total , hits)


In [None]:
def similarity(v1 , v2):
  """Cosine similarity of ``v1`` and ``v2`` rescaled from [-1, 1] to [0, 1].

  Returns 0.0 when either vector contains only zeros, since the cosine is
  undefined in that case.
  """
  if np.count_nonzero(v1) == 0 or np.count_nonzero(v2) == 0:
    return 0.0
  # scipy's `cosine` is a distance, so (1 - distance) is the similarity.
  return ((1 - cosine(v1,v2)) + 1) / 2

In [None]:
def summarize(text , embedding_model , max_words = 100 , redundancy_threshold = 0.95 , verbose = True):
  """Build an extractive summary of ``text`` with a centroid-embedding ranking.

  Sentences are scored by the similarity between their averaged word
  embedding and the embedding of the document's centroid words (from
  ``get_tf_idf``). The best-scoring sentences are picked greedily, skipping
  any sentence that is too similar to one already picked, until the word
  budget is exceeded. The picked sentences are returned in document order.

  Args:
    text: Document to summarize.
    embedding_model: Word-embedding model handed to ``word_vector_cache`` and
      ``build_embedding_representation``.
    max_words: Selection stops once the summary holds MORE than this many
      whitespace-separated words, so the last sentence may overshoot.
    redundancy_threshold: A candidate is skipped when its similarity to an
      already selected sentence is above this value.
    verbose: When True (the default, matching the historical behaviour),
      print the sentences, centroid words and ranking.

  Returns:
    The selected raw sentences joined by newlines, or "" if ``text`` yields
    no sentences.
  """
  raw_sentences = sent_tokenize(text)
  clean_sentences = cleanup_sentences(text)

  # Nothing to rank; avoids fitting the vectorizer on an empty corpus.
  if not raw_sentences:
    return ""

  if verbose:
    for i , s in enumerate(raw_sentences):
      print(i,s)
    for i , s in enumerate(clean_sentences):
      print(i,s)

  centroid_words = get_tf_idf(clean_sentences)
  if verbose:
    print(len(centroid_words) , centroid_words)
  word_vectors = word_vector_cache(clean_sentences , embedding_model)

  # Embedding of the document centroid.
  centroid_vector = build_embedding_representation(centroid_words , word_vectors ,embedding_model)

  # (index, raw sentence, score, sentence vector) for every sentence.
  sentences_score = []
  for i , clean in enumerate(clean_sentences):
    sentence_vector = build_embedding_representation(clean.split() , word_vectors ,embedding_model)
    score = similarity(sentence_vector , centroid_vector)
    sentences_score.append((i , raw_sentences[i] , score , sentence_vector))

  sentences_score_sort = sorted(sentences_score , key = lambda el: el[2] , reverse = True)

  if verbose:
    for s in sentences_score_sort:
      print(s[0] , s[1] , s[2])

  count = 0
  sentence_summary = []
  for s in sentences_score_sort:
    if count > max_words:
      break
    is_redundant = any(
        similarity(s[3] , ps[3]) > redundancy_threshold for ps in sentence_summary
    )
    if not is_redundant:
      sentence_summary.append(s)
      count += len(s[1].split())

  # Restore document order once, after the selection is complete.
  sentence_summary.sort(key = lambda el: el[0])

  return "\n".join(s[1] for s in sentence_summary)

In [None]:
# Train the embeddings on real token lists: one list of words per cleaned
# sentence, across all articles. (Casting `cleaned_sentences` to str and
# tokenizing that would train on the textual repr of a Python list, including
# its brackets and quotes, and would destroy the list column.)
sentences = [
    nltk_word_tokenize(sent)
    for doc_sentences in df['cleaned_sentences']
    for sent in doc_sentences
]

# min_count=1 keeps every word so each token of a cleaned sentence has a vector;
# sg=1 selects skip-gram. A fixed seed with a single worker keeps runs repeatable.
model = Word2Vec(sentences , min_count = 1 , sg = 1 , seed = 42 , workers = 1)

# summarize() does its own sentence splitting and cleaning, so give it the
# article text (the cleaned variant, whose vocabulary the model was trained on)
# rather than the already-cleaned sentence list.
df['summary'] = df['player_clean_information'].apply(lambda x:summarize(x ,model))

In [None]:
# Show the generated summary stored under index label 0.
df['summary'][0]

In [None]:
# Preview the frame with the new `summary` column.
df.head()

In [None]:
# Token length of each summary, measured with the same tokenizer as `num_tokens`.
df["Summary_token_count"] = df['summary'].apply(count_tokens)

In [None]:
import plotly.express as px

# Article length vs. summary length, both in tokenizer tokens.
# `size_max` was dropped: it only has an effect together with a `size` column.
fig = px.scatter(
    df,
    x='num_tokens',
    y='Summary_token_count',
    opacity=0.8,
    title='Summary length vs. article length',
    labels={'num_tokens': 'Article tokens', 'Summary_token_count': 'Summary tokens'},
)

# Draw black axis lines and a light-grey grid; a single trace needs no legend.
fig.update_layout(xaxis=dict(showline=True, linecolor='black', showgrid=True, gridcolor='lightgrey'),
                  yaxis=dict(showline=True, linecolor='black', showgrid=True, gridcolor='lightgrey'),
                  showlegend=False)

fig.show()


In [None]:
# Preview the frame with the new `Summary_token_count` column.
df.head()

In [None]:
# Show the generated summary stored under index label 3.
df['summary'][3]

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge
import sys

# Raise the recursion limit before scoring.
# NOTE(review): presumably needed because the rouge package recurses deeply on
# long texts - confirm before lowering or removing.
sys.setrecursionlimit(10000)

rouge = Rouge()

# F-scores per row, in DataFrame row order.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

for summary, player_information in zip(df['summary'], df['player_information']):
    # The summary is the hypothesis and the full raw article is the reference.
    scores = rouge.get_scores(summary, player_information)[0]
    rouge_1_scores.append(scores['rouge-1']['f'])
    rouge_2_scores.append(scores['rouge-2']['f'])
    rouge_l_scores.append(scores['rouge-l']['f'])

# Attach the scores as new columns.
df['rouge_1_score'] = rouge_1_scores
df['rouge_2_score'] = rouge_2_scores
df['rouge_l_score'] = rouge_l_scores



In [None]:
# Preview the frame with the new ROUGE score columns.
df.head()

In [None]:
import plotly.express as px

# ROUGE-1 F-score of each summary against the article's token count.
fig = px.scatter(df, x='num_tokens', y='rouge_1_score', opacity=0.8,
                 title='Scatter Plot of num_tokens vs rouge_1_score',
                 labels={'num_tokens': 'Number of Tokens', 'rouge_1_score': 'ROUGE-1 Score'})

# Name the ROUGE trace so both series are identifiable in the legend.
fig.update_traces(name='rouge_1_score', showlegend=True)

# Axis lines on, grid and zero lines off. Summary token counts are in the
# hundreds while ROUGE scores lie in [0, 1], so the counts get their own
# right-hand axis instead of flattening the ROUGE points on a shared scale.
fig.update_layout(xaxis=dict(showline=True, showgrid=False, zeroline=False),
                  yaxis=dict(showline=True, showgrid=False, zeroline=False),
                  yaxis2=dict(title='Summary token count', overlaying='y', side='right',
                              showline=True, showgrid=False, zeroline=False),
                  legend=dict(orientation='h', y=-0.2))

# Overlay the summary length on the secondary y-axis.
fig.add_scatter(x=df['num_tokens'], y=df['Summary_token_count'], mode='markers',
                marker=dict(color='red'), name='Summary_token_count', yaxis='y2')

# Show the interactive plot
fig.show()
