In [58]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('resources/train.csv')
test_data = pd.read_csv('resources/test.csv')
val_data = pd.read_csv('resources/validation.csv')

KeyboardInterrupt: 

## Testing data 

In [45]:
train_data.head(n=5)

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [46]:
test_data.head(n=5)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [47]:
val_data.head(n=5)

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


## Preprocessing

### Necessary NLTK data files 

In [48]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [49]:
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Preload stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)



In [50]:
train_data_subset = train_data.copy().head(n=5)
train_data_subset['processed_article'] = train_data_subset['article'].swifter.apply(preprocess_text)
train_data_subset.head()


Pandas Apply:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,id,article,highlights,processed_article
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ...",associ press publish 1411 est 25 octob 2013 up...
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...,cnn ralph mata intern affair lieuten miamidad ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t...",drunk driver kill young woman headon crash che...
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...,cnn breezi sweep pen presid vladimir putin wro...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...,fleetwood team still 100 record sky bet leagu ...


## Convert text into embeddings(TF-IDF)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 most important words

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data_subset['processed_article'])

# Convert to DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Show first few rows of TF-IDF embeddings
print(tfidf_df.head())


         10       100        11      1411        15      1536      1992  \
0  0.000000  0.000000  0.000000  0.063727  0.000000  0.063727  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.040969   
2  0.000000  0.000000  0.021329  0.000000  0.000000  0.000000  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.044107  0.000000  0.000000   
4  0.038786  0.038786  0.031292  0.000000  0.000000  0.000000  0.000000   

         20      2005      2010  ...     would   wreckag   wrongdo     wrote  \
0  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.040969  ...  0.027438  0.000000  0.040969  0.000000   
2  0.000000  0.000000  0.000000  ...  0.053114  0.026436  0.000000  0.000000   
3  0.000000  0.044107  0.000000  ...  0.118155  0.000000  0.000000  0.044107   
4  0.038786  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   

   yarmouth      year    yeovil  yesterday       yet     young  
0  

### Find the most important word

In [52]:
top_words = tfidf_df.max().sort_values(ascending=False).head(10)
top_words

mata             0.491631
drive            0.343673
fargo            0.318637
dioces           0.318637
bishop           0.318637
ecclestontodd    0.317236
europ            0.308746
complaint        0.286785
goal             0.271500
russia           0.264640
dtype: float64

### Save process text in to csv file

In [53]:
tfidf_df.to_csv('resources/tfidf_matrix.csv')

## TextRank Implementation

In [57]:
from summa import summarizer

def text_rank_summarize(text, ratio=0.2):
    """
    Summarize text using TextRank.
    :param text: The input text (article).
    :param ratio: Fraction of sentences to keep (default: 20%).
    :return: Summarized text.
    """
    summary = summarizer.summarize(text, ratio=ratio)
    return summary

# Apply TextRank on preprocessed text
train_data_subset['summary'] = train_data_subset['processed_article'].apply(lambda x: text_rank_summarize(x, ratio=0.2))

# Print example summary
train_data_subset[['processed_article', 'summary']].head()


Unnamed: 0,processed_article,summary
0,associ press publish 1411 est 25 octob 2013 up...,
1,cnn ralph mata intern affair lieuten miamidad ...,
2,drunk driver kill young woman headon crash che...,
3,cnn breezi sweep pen presid vladimir putin wro...,
4,fleetwood team still 100 record sky bet leagu ...,
