In [4]:
# Text Summarization Using NLP
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('punkt')
nltk.download('punkt_tab')
df = pd.read_csv('/Users/patash/PSTB/Week_5_NLP/day_5/tennis_articles.csv', encoding='ISO-8859_1')

[nltk_data] Downloading package punkt to /Users/patash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article_id     8 non-null      int64 
 1   article_title  8 non-null      object
 2   article_text   8 non-null      object
 3   source         8 non-null      object
dtypes: int64(1), object(3)
memory usage: 388.0+ bytes
None


In [6]:
print(df.head())

   article_id                                      article_title  \
0           1  I do not have friends in tennis, says Maria Sh...   
1           2  Federer defeats Medvedev to advance to 14th Sw...   
2           3  Tennis: Roger Federer ignored deadline set by ...   
3           4  Nishikori to face off against Anderson in Vien...   
4           5  Roger Federer has made this huge change to ten...   

                                        article_text  \
0  Maria Sharapova has basically no friends as te...   
1  BASEL, Switzerland (AP)  Roger Federer advanc...   
2  Roger Federer has revealed that organisers of ...   
3  Kei Nishikori will try to end his long losing ...   
4  Federer, 37, first broke through on tour over ...   

                                              source  
0  https://www.tennisworldusa.org/tennis/news/Mar...  
1  http://www.tennis.com/pro-game/2018/10/copil-s...  
2  https://scroll.in/field/899938/tennis-roger-fe...  
3  http://www.tennis.com/pro-game/

In [7]:
# Remove Unnecessary Columns
df = df.drop(columns=['article_title', 'source', 'article_id'])

print(df.info()) 
print(df.head()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_text  8 non-null      object
dtypes: object(1)
memory usage: 196.0+ bytes
None
                                        article_text
0  Maria Sharapova has basically no friends as te...
1  BASEL, Switzerland (AP)  Roger Federer advanc...
2  Roger Federer has revealed that organisers of ...
3  Kei Nishikori will try to end his long losing ...
4  Federer, 37, first broke through on tour over ...


Text Preprocessing

In [8]:
# Tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

df_tokenized = df['article_text'].apply(nltk.sent_tokenize)
print(df_tokenized[0])

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.', "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.", 'I think everyone knows this is my job here.', "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.", "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.", "I'm a pretty competitive girl.", "I say my hellos, but I'm not sending any players flowers as well.", "Uhm, I'm not really friendly or close to many players.", "I have not a lot of friends away from the courts.'", 'When she said she is not really close to a lot of players, is that something strategic that she is doing?', "Is it different on the men's tour than the women's tour?", "'No, not at all.", "I 

[nltk_data] Downloading package punkt to /Users/patash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# GloVe Word Embeddings
# Load GloVe vectors 
import numpy as np
from tqdm import tqdm  # Barre de progression

def load_glove_embeddings(file_path):
    glove_embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe embeddings"):  # Affichage de la progression
            values = line.strip().split()
            word = values[0]  # Le mot
            vector = np.array(values[1:], dtype='float32')  # Le vecteur
            glove_embeddings[word] = vector
    return glove_embeddings

glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')
print("Loaded", len(glove_embeddings), "words into embeddings.")

Loading GloVe embeddings: 400000it [00:06, 61240.97it/s]


Loaded 400000 words into embeddings.


In [17]:
# Cleaning 
import re
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_sentence(sentence):

  sentence = re.sub(r'[^\w\s]', '', sentence) # Ponctuation
  sentence = re.sub(r'\d+', '', sentence) # Chiffres
  sentence = sentence.lower()
  words = sentence.split()
  words = [word for word in words if word not in stop_words]

  return sentence

sentences = df_tokenized.explode().tolist()

cleaned_sentences = [clean_sentence(sentence) for sentence in sentences]
print(cleaned_sentences[0])

maria sharapova has basically no friends as tennis players on the wta tour


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Vectorize sentences: average of the word embeddings
# Function to get sentence embedding
def get_sentence_embedding(sentence):
    words = sentence.split()  # Split cleaned sentence into words
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]  
    if len(word_vectors) == 0:
        return np.zeros(100)  # Return a zero vector if no words have embeddings
    return np.mean(word_vectors, axis=0)  # Compute the average of word vectors

In [19]:
# Convert cleaned sentences into embeddings
sentence_embeddings = np.array([get_sentence_embedding(sentence) for sentence in cleaned_sentences])

print(sentence_embeddings.shape)

(130, 100)


In [23]:
# Similarity Matrix 

from sklearn.metrics.pairwise import cosine_similarity

cos_similarity_matrix = cosine_similarity(sentence_embeddings)
print(cos_similarity_matrix)
print(cos_similarity_matrix.shape)

[[1.0000001  0.8819135  0.8373998  ... 0.9248717  0.872359   0.81378967]
 [0.8819135  1.0000001  0.9307503  ... 0.896164   0.93493533 0.8766065 ]
 [0.8373998  0.9307503  1.0000001  ... 0.84453756 0.9069677  0.81943476]
 ...
 [0.9248717  0.896164   0.84453756 ... 0.99999976 0.8930172  0.8519911 ]
 [0.872359   0.93493533 0.9069677  ... 0.8930172  0.99999964 0.86473227]
 [0.81378967 0.8766065  0.81943476 ... 0.8519911  0.86473227 0.9999999 ]]
(130, 130)


In [25]:
# Graph Construction
import networkx as nx

G = nx.Graph()

# Add nodes (each sentence is a node)
for i, sentence in enumerate(cleaned_sentences):
    G.add_node(i, text=sentence)

# Define similarity threshold (higher = fewer edges)
threshold = 0.75

 # Add edges based on cos_similarity_matrix
for i in range(len(cleaned_sentences)):
    for j in range(i + 1, len(cleaned_sentences)):  # Avoid duplicate pairs
        if cos_similarity_matrix[i][j] > threshold:  # Use your existing similarity matrix
            G.add_edge(i, j, weight=cos_similarity_matrix[i][j])  # Edge with similarity score

print(f"Total Nodes: {G.number_of_nodes()}")
print(f"Total Edges: {G.number_of_edges()}")

Total Nodes: 130
Total Edges: 8001


In [26]:
# Sentence Ranking 
# Apply PageRank algorithm to rank sentences based on importance
sentence_scores = nx.pagerank(G, alpha=0.85,  weight='weight')

# Convert scores into a sorted list (higher score = more important)
sorted_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)

# Print the top-ranked sentences (optional)
print("\n Top Ranked Sentences (by PageRank Score):")
for i, (idx, score) in enumerate(sorted_sentences[:5]):  # Displaying top 5 sentences
    print(f"{i+1}. (Score: {score:.4f}) - {cleaned_sentences[idx]}")


 Top Ranked Sentences (by PageRank Score):
1. (Score: 0.0082) - so im not the one to strike up a conversation about the weather and know that in the next few minutes i have to go and try to win a tennis match
2. (Score: 0.0082) - i was on a nice trajectorythen reid recalledif i hadnt got sick i think i could have started pushing towards the second week at the slams and then who knows duringa comeback attempt some five years later reid added bernard tomic and  us open federer slayer john millman to his list of career scalps
3. (Score: 0.0082) - i just felt like it really kind of changed where people were a little bit definitely in the s a lot more quiet into themselves and then it started to become better meanwhile federer is hoping he can improve his service game as he hunts his ninth swiss indoors title this week
4. (Score: 0.0082) - speaking at the swiss indoors tournament where he will play in sundays final against romanian qualifier marius copil the world number three said that gi

In [28]:
# Summarization

N = 3
top_sentences = [cleaned_sentences[idx] for idx, _ in sorted_sentences[:N]]

# Step 4: Print Final Summary
print(" ".join(top_sentences))

so im not the one to strike up a conversation about the weather and know that in the next few minutes i have to go and try to win a tennis match i was on a nice trajectorythen reid recalledif i hadnt got sick i think i could have started pushing towards the second week at the slams and then who knows duringa comeback attempt some five years later reid added bernard tomic and  us open federer slayer john millman to his list of career scalps i just felt like it really kind of changed where people were a little bit definitely in the s a lot more quiet into themselves and then it started to become better meanwhile federer is hoping he can improve his service game as he hunts his ninth swiss indoors title this week
