In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Initial ZIP file loading: extract the dataset archive from Google Drive into the Colab runtime.

In [None]:
import zipfile
import os

# Archive on the mounted Drive and the local folder it is unpacked into
zip_path = '/content/drive/MyDrive/cnn_dailymail_dataset.zip'
extract_dir = '/content/data'

# Unpack every member of the archive
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

# Show the top-level entries to confirm the extraction worked
os.listdir(extract_dir)


['__MACOSX', 'cnn_dailymail_dataset']

## Quality data segregation

In [None]:
import os
import random
import pandas as pd
import numpy as np
import random

In [None]:
# Progress bars for the two long file-reading loops below
from tqdm import tqdm

# Directories holding one text file per article / per summary
articles_path = '/content/data/cnn_dailymail_dataset/Articles'
summaries_path = '/content/data/cnn_dailymail_dataset/Summaries'

# os.listdir() returns names in arbitrary order, so both directories are listed
# in sorted order; otherwise entry i of `articles` has no defined relationship
# to entry i of `summaries`.
# NOTE(review): assumes sorted file names pair each article with its summary —
# confirm against the dataset's naming scheme.
article_files = sorted(os.listdir(articles_path))
summary_files = sorted(os.listdir(summaries_path))
assert len(article_files) == len(summary_files), "Articles/Summaries file counts differ"

# Lists to hold articles and summaries (same order as the sorted file names)
articles = []
summaries = []

# Read each file and store content
for filename in tqdm(article_files):
    with open(os.path.join(articles_path, filename), 'r', encoding='utf-8') as file:
        articles.append(file.read())

for filename in tqdm(summary_files):
    with open(os.path.join(summaries_path, filename), 'r', encoding='utf-8') as file:
        summaries.append(file.read())


100%|██████████| 287113/287113 [01:45<00:00, 2725.29it/s]
100%|██████████| 287113/287113 [00:13<00:00, 21815.60it/s]


In [None]:
# Pair the loaded texts column-wise: one row per (article, summary)
df = pd.DataFrame(dict(article=articles, summary=summaries))

# Preview the first rows
df.head()

Unnamed: 0,article,summary
0,'Liberation didn't happen': Feminist academic ...,"Carer Janet Maddocks, 57, was caught red-hand..."
1,By . Peter Campbell and Tamara Cohen . PUBLISH...,"Kate, 40, reveals her beauty tips and how to l..."
2,Brazil's sports tribunal says Chile playmaker ...,CNN's Kara Devlin was diagnosed with hyperemes...
3,By . Anna Hodgekiss . A woman given a vagina g...,More seniors are making music in their golden ...
4,"By . Olivia Williams . PUBLISHED: . 15:17 EST,...",Jimmy Greaves did not play in the 1966 World C...


In [None]:
# Helper function to read text files
def read_file(filepath):
    """Return the full contents of the UTF-8 text file at `filepath`."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Folders containing the raw article and summary text files
articles_path = '/content/data/cnn_dailymail_dataset/Articles'
summaries_path = '/content/data/cnn_dailymail_dataset/Summaries'

# File names of each folder, sorted in place so both lists share one ordering
all_article_files = os.listdir(articles_path)
all_article_files.sort()
all_summary_files = os.listdir(summaries_path)
all_summary_files.sort()

# Both folders must contain the same number of files
assert len(all_summary_files) == len(all_article_files)

In [None]:

# Set parameters for batch processing
total_files = len(all_article_files)
# Process ~10% of the files at a time; max(1, ...) keeps the range() step below
# non-zero when there are fewer than 10 files.
batch_size = max(1, int(0.1 * total_files))
article_min_words = 250
summary_min_words = 10

# Drive folder that receives the per-batch and combined CSV files.
# to_csv() does not create missing folders, so make sure it exists first.
output_dir = '/content/drive/My Drive/CNN_Dataset_Preprocessing'
os.makedirs(output_dir, exist_ok=True)

# Shuffle the files for randomness (fixed seed keeps the batches reproducible)
np.random.seed(42)
indices = np.random.permutation(total_files)

# Initialize list to store batch-wise results
batch_results = []

# Batch-wise processing
for i in range(0, total_files, batch_size):
    batch_number = i // batch_size + 1
    print(f"Processing batch {batch_number}...")

    # Get the current batch of files (same shuffled positions in both lists)
    batch_indices = indices[i:i + batch_size]
    batch_article_files = [all_article_files[idx] for idx in batch_indices]
    batch_summary_files = [all_summary_files[idx] for idx in batch_indices]

    # Initialize lists to store filtered articles and summaries
    filtered_articles = []
    filtered_summaries = []

    # Process each article-summary pair in the current batch
    for article_file, summary_file in zip(batch_article_files, batch_summary_files):
        article_text = read_file(os.path.join(articles_path, article_file))
        summary_text = read_file(os.path.join(summaries_path, summary_file))

        # Keep the pair only if both texts meet their minimum word counts
        if len(article_text.split()) >= article_min_words and len(summary_text.split()) >= summary_min_words:
            filtered_articles.append(article_text)
            filtered_summaries.append(summary_text)

    # Create a DataFrame for the current batch and drop duplicates within it
    batch_df = pd.DataFrame({'article': filtered_articles, 'summary': filtered_summaries})
    batch_df = batch_df.drop_duplicates(subset=['article', 'summary'])

    # Store the filtered batch
    batch_results.append(batch_df)

    # Save the current batch to a CSV file as a checkpoint
    batch_df.to_csv(f'{output_dir}/batch_{batch_number}.csv', index=False)

# Combine all batches into one DataFrame (pd.concat raises on an empty list)
if batch_results:
    combined_df = pd.concat(batch_results, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=['article', 'summary'])

# Deduplicate across batches; reset the index so the numbered text files
# written below are contiguous (article_0, article_1, ...).
combined_df = combined_df.drop_duplicates(subset=['article', 'summary']).reset_index(drop=True)
print(f"Total cleaned article-summary pairs: {len(combined_df)}")

# Save the combined DataFrame. It is written as first_cleaned_dataset.csv because
# that is the file the quality-based filtering section loads, and because the
# last save cell of the notebook writes to final_cleaned_dataset.csv.
combined_df.to_csv(f'{output_dir}/first_cleaned_dataset.csv', index=False)

# Create directories to save the cleaned text files
os.makedirs('/content/final_cleaned_data/Articles', exist_ok=True)
os.makedirs('/content/final_cleaned_data/Summaries', exist_ok=True)

# Save the cleaned articles and summaries back to text files
for index, row in combined_df.iterrows():
    # Save article
    article_filename = f"/content/final_cleaned_data/Articles/article_{index}.txt"
    with open(article_filename, 'w', encoding='utf-8') as file:
        file.write(row['article'])

    # Save summary
    summary_filename = f"/content/final_cleaned_data/Summaries/summary_{index}.txt"
    with open(summary_filename, 'w', encoding='utf-8') as file:
        file.write(row['summary'])

print("Batch processing and filtering completed successfully!")


Processing batch 1...
Processing batch 2...
Processing batch 3...
Processing batch 4...
Processing batch 5...
Processing batch 6...
Processing batch 7...
Processing batch 8...
Processing batch 9...
Processing batch 10...
Processing batch 11...
Total cleaned article-summary pairs: 269710
Batch processing and filtering completed successfully!


We will apply quality-based filtering using two criteria:

**Summarization Ratio**: To ensure that the summary is a meaningful representation of the article. For example, you can filter out pairs where the summary is too short relative to the article length.

**Unique Words Count**: To ensure the content richness, we can filter out articles that have too few unique words, which might indicate redundancy or low-quality content.


In [None]:
import pandas as pd
import os

In [None]:
# Load the first-pass cleaned dataset from Drive.
# NOTE(review): this path must match the file name written by the batch-filtering
# step — confirm before running.
file_path = '/content/drive/My Drive/CNN_Dataset_Preprocessing/first_cleaned_dataset.csv'
df = pd.read_csv(file_path)

In [None]:
# Function to calculate summarization ratio
def summarization_ratio(article, summary):
    """Return summary word count divided by article word count.

    Words are whitespace-separated tokens. An article with no words yields 0.
    """
    n_article_words = len(article.split())
    n_summary_words = len(summary.split())
    if n_article_words > 0:
        return n_summary_words / n_article_words
    return 0

# Function to count unique words in a text
def unique_word_count(text):
    """Count the distinct whitespace-separated tokens in `text` (case-sensitive)."""
    distinct_tokens = {token for token in text.split()}
    return len(distinct_tokens)


In [None]:
# Filtering thresholds:
#   min_ratio        - a summary must be at least 5% of its article's length (in words)
#   min_unique_words - an article must contain at least this many distinct words
min_ratio, min_unique_words = 0.05, 50


In [None]:
# Step 1: Summarization Ratio Filter
# Compute the ratio pair by pair; zipping the two columns avoids the slow
# row-wise DataFrame.apply(axis=1) and also works on an empty frame.
df['summarization_ratio'] = [
    summarization_ratio(article, summary)
    for article, summary in zip(df['article'], df['summary'])
]

# Keep only pairs at or above the threshold. .copy() makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['summarization_ratio'] >= min_ratio].copy()


In [None]:
# Step 2: Apply Unique Words Count Filter
df['unique_word_count'] = df['article'].apply(unique_word_count)

# Keep only pairs whose article has enough distinct words. .copy() detaches the
# result from the unfiltered frame so later in-place edits and column
# assignments do not raise SettingWithCopyWarning.
df = df[df['unique_word_count'] >= min_unique_words].copy()


In [None]:
# The two helper columns were only needed to build the filters; remove them
df = df.drop(columns=['summarization_ratio', 'unique_word_count'])

# Report how many pairs survived both filters
print(f"Total pairs after Quality-Based Filtering: {len(df)}")


Total pairs after Quality-Based Filtering: 213770


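More aggressive quality filtering: raise both thresholds (summarization ratio of at least 10%, at least 100 unique words per article).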

In [None]:
# Stricter thresholds for the second pass
min_ratio = 0.10  # summary must be at least 10% of the article length
min_unique_words = 100  # article must contain at least 100 distinct words

# Recompute the ratio on the current frame and keep the rows that pass
df['summarization_ratio'] = df.apply(lambda row: summarization_ratio(row['article'], row['summary']), axis=1)
ratio_ok = df['summarization_ratio'] >= min_ratio
df_filtered = df.loc[ratio_ok].copy()

# Count distinct words per remaining article and keep the rows that pass
df_filtered['unique_word_count'] = df_filtered['article'].map(unique_word_count)
vocab_ok = df_filtered['unique_word_count'] >= min_unique_words
df_filtered = df_filtered.loc[vocab_ok].copy()

# Remove the helper columns before saving
df_filtered = df_filtered.drop(columns=['summarization_ratio', 'unique_word_count'])

# Report the surviving pair count
print(f"Total pairs after more aggressive Quality-Based Filtering: {len(df_filtered)}")

# Persist the aggressively filtered dataset to Drive
df_filtered.to_csv('/content/drive/My Drive/CNN_Dataset_Preprocessing/aggressively_filtered_dataset.csv', index=False)


Total pairs after more aggressive Quality-Based Filtering: 78833


In [None]:
import pandas as pd

# Read the aggressively filtered dataset back from Drive
filtered_csv_path = '/content/drive/My Drive/CNN_Dataset_Preprocessing/aggressively_filtered_dataset.csv'
df_filtered = pd.read_csv(filtered_csv_path)

# Report (rows, columns) of what was loaded
print(f"Loaded dataset shape: {df_filtered.shape}")


Loaded dataset shape: (78833, 2)


In [None]:
import os

# Output folders on Drive: one for articles, one for summaries
articles_dir = '/content/drive/My Drive/CNN_Dataset_Preprocessing/Filtered_Dataset/Articles'
summaries_dir = '/content/drive/My Drive/CNN_Dataset_Preprocessing/Filtered_Dataset/Summaries'
for folder in (articles_dir, summaries_dir):
    os.makedirs(folder, exist_ok=True)

# Write each pair to article_<idx>.txt / summary_<idx>.txt, where idx is the row's index label
for idx, article_text, summary_text in zip(df_filtered.index, df_filtered['article'], df_filtered['summary']):
    with open(f"{articles_dir}/article_{idx}.txt", 'w', encoding='utf-8') as file:
        file.write(article_text)

    with open(f"{summaries_dir}/summary_{idx}.txt", 'w', encoding='utf-8') as file:
        file.write(summary_text)

print("Articles and Summaries saved successfully in separate folders!")


Articles and Summaries saved successfully in separate folders!


## Preprocessing

In [None]:
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")


Using device: cuda


In [None]:
!pip install --upgrade pip setuptools

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-75.4.0-py3-none-any.whl.metadata (6.8 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-75.4.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.1.0
    Uninstalling setuptools-75.1.0:
      Successfully uninstalled setuptools-75.1.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavio

In [None]:
# Install necessary libraries
!pip install nltk scikit-learn spacy gensim pyspellchecker langdetect

# Download additional NLTK resources
import nltk
nltk.download('punkt')  # Make sure this is correctly downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Download Spacy model for NER
!python -m spacy download en_core_web_sm



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import necessary libraries after downloading
import re
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from spellchecker import SpellChecker
from langdetect import detect, LangDetectException
from collections import Counter
from nltk import pos_tag
from nltk.chunk import ne_chunk

# NLTK helpers: English stop-word set, WordNet lemmatizer, Porter stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

# pyspellchecker instance used by the spelling-correction step
spell = SpellChecker()

# spaCy pipeline (en_core_web_sm) used for tokenization, POS tagging and NER
spacy_nlp = spacy.load("en_core_web_sm")


### Data Cleaning

Lowercase Transformation: Convert text to lowercase to ensure uniformity.

In [None]:
def convert_to_lowercase(text):
    """Return `text` with every cased character lowered."""
    lowered = text.lower()
    return lowered

Remove URLs and Emails: Eliminate any web addresses or email addresses if present.

In [None]:
def remove_urls(text):
    """Delete every run of non-space characters that starts at 'http' or 'www'."""
    url_pattern = re.compile(r'http\S+|www\S+')
    return url_pattern.sub('', text)

def remove_emails(text):
    """Delete every whitespace-delimited token that has an '@' between other characters."""
    email_pattern = re.compile(r'\S+@\S+')
    return email_pattern.sub('', text)


Expand Contractions: Convert contractions to their full forms (e.g., "can't" to "cannot") to
ensure consistency.

In [None]:
# Irregular contractions that are replaced as whole words.
_IRREGULAR_CONTRACTIONS = {"can't": "cannot", "won't": "will not"}

# Suffix contractions; expanded only when attached to a preceding word character.
_SUFFIX_CONTRACTIONS = {
    "n't": " not", "'re": " are", "'s": " is", "'d": " would",
    "'ll": " will", "'t": " not", "'ve": " have", "'m": " am",
}

_IRREGULAR_PATTERN = re.compile(
    r"\b(" + "|".join(re.escape(form) for form in _IRREGULAR_CONTRACTIONS) + r")\b",
    re.IGNORECASE,
)

# Longest suffixes first so "n't" is tried before "'t". The lookbehind requires
# a word character before the suffix and the trailing \b requires the suffix to
# end the word, so an opening quote such as 'my or 'do is left untouched.
_SUFFIX_PATTERN = re.compile(
    r"(?<=\w)("
    + "|".join(re.escape(form) for form in sorted(_SUFFIX_CONTRACTIONS, key=len, reverse=True))
    + r")\b",
    re.IGNORECASE,
)


def expand_contractions(text):
    """Expand common English contractions.

    "can't" and "won't" are replaced as whole words; the suffixes n't, 're, 's,
    'd, 'll, 't, 've and 'm are expanded only when they end a word. Typographic
    apostrophes (U+2019) are converted to "'" first so both spellings are
    handled. Matching is case-insensitive and the inserted expansion is
    lower-case.

    Known limitation (unchanged): possessive "'s" is expanded to " is" as well.

    Args:
        text: Input string.

    Returns:
        The string with contractions expanded.
    """
    text = text.replace("\u2019", "'")
    text = _IRREGULAR_PATTERN.sub(
        lambda match: _IRREGULAR_CONTRACTIONS[match.group(1).lower()], text
    )
    return _SUFFIX_PATTERN.sub(
        lambda match: _SUFFIX_CONTRACTIONS[match.group(1).lower()], text
    )

Remove Special Characters and Punctuation: Remove non-alphanumeric characters (like @, #,
&, etc.), punctuation, and any irrelevant symbols.

In [None]:
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit, or whitespace."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

Text Normalization: Standardize text by removing extra spaces, tabs, and newline characters.

In [None]:
def remove_extra_spaces(text):
    """Collapse each whitespace run to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

Correct Misspellings

In [None]:
# word -> result of spell.correction(word), shared across calls. The lookup is
# the expensive part of this step and the same words recur throughout a corpus,
# so each distinct word is looked up only once.
_spelling_cache = {}


def correct_spellings(text):
    """Correct misspelled words.

    The text is split on whitespace and each word is replaced by
    `spell.correction(word)`; when that returns None (or another falsy value)
    the original word is kept. Results are memoised per distinct word.

    Args:
        text: Input string.

    Returns:
        The words, corrected where possible, joined by single spaces.
    """
    corrected_words = []
    for word in text.split():
        if word not in _spelling_cache:
            _spelling_cache[word] = spell.correction(word)
        correction = _spelling_cache[word]
        # If correction is None, use the original word
        corrected_words.append(correction if correction else word)
    return ' '.join(corrected_words)


Remove Stop Words: Remove common but less informative words (e.g., "is," "and," "the").

In [None]:
def remove_stopwords(text):
    """Drop whitespace-separated tokens found in `stop_words`; join the rest with spaces."""
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def clean_text(text):
    """Run `text` through every cleaning step, in order, and return the result.

    Order: lowercase, strip URLs, strip e-mail addresses, expand contractions,
    strip special characters, collapse whitespace, correct spelling, remove
    stop words.
    """
    pipeline = (
        convert_to_lowercase,
        remove_urls,
        remove_emails,
        expand_contractions,
        remove_special_characters,
        remove_extra_spaces,
        correct_spellings,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text


In [None]:
import pandas as pd

# Read the aggressively filtered article/summary pairs
dataset_csv = '/content/drive/My Drive/CNN_Dataset_Preprocessing/aggressively_filtered_dataset.csv'
df = pd.read_csv(dataset_csv)

# Add a cleaned counterpart for each text column: article_cleaned, summary_cleaned
for source_col in ('article', 'summary'):
    df[f'{source_col}_cleaned'] = df[source_col].apply(clean_text)

# Persist the original and cleaned columns together
df.to_csv('/content/drive/My Drive/CNN_Dataset_Preprocessing/cleaned_dataset.csv', index=False)

print(f"Total pairs after comprehensive cleaning: {len(df)}")


In [None]:
# Load the cleaned sample used by the feature-extraction cells below.
# If the sample file is missing it is built here, so the notebook still runs
# from a fresh kernel.
# NOTE(review): the original sampling step is not visible in this notebook; the
# fallback draws 1000 rows (matching the recorded TF-IDF shape of (1000, 5000))
# with a fixed seed — confirm the size and seed.
sampled_path = '/content/drive/MyDrive/CNN_Dataset_Preprocessing/cleaned_sampled_dataset.csv'

if os.path.exists(sampled_path):
    df = pd.read_csv(sampled_path)
else:
    filtered_full = pd.read_csv('/content/drive/My Drive/CNN_Dataset_Preprocessing/aggressively_filtered_dataset.csv')
    df = (
        filtered_full
        .sample(n=min(1000, len(filtered_full)), random_state=42)
        .reset_index(drop=True)
    )
    # Clean only the sampled rows; spelling correction is too slow for the full set
    df['article_cleaned'] = df['article'].apply(clean_text)
    df['summary_cleaned'] = df['summary'].apply(clean_text)
    df.to_csv(sampled_path, index=False)


In [None]:
# Sentence Tokenization
def sentence_tokenize(text):
    """Split `text` into sentences using NLTK's `sent_tokenize`."""
    sentences = sent_tokenize(text)
    return sentences

def spacy_sentence_tokenize(text):
    """Return the text of each sentence found by the spaCy pipeline."""
    sentences = []
    for sentence in spacy_nlp(text).sents:
        sentences.append(sentence.text)
    return sentences

# Word Tokenization
def word_tokenize_text(text):
    """Split `text` into word tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

# Define a Spacy-based word tokenizer
def spacy_word_tokenize(text):
    """Return the text of every token the spaCy pipeline produces for `text`."""
    tokens = []
    for token in spacy_nlp(text):
        tokens.append(token.text)
    return tokens

# Lemmatization
def lemmatize_words(words):
    """Return the lemma of each word in `words`, in the same order."""
    lemmas = []
    for word in words:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Stemming
def stem_words(words):
    """Return the stem of each word in `words`, in the same order."""
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems

# POS Tagging using Spacy
def spacy_pos_tagging(words_list):
    """Join the words with spaces, run spaCy on the result, and return (text, POS) per token."""
    doc = spacy_nlp(" ".join(words_list))
    tagged = []
    for token in doc:
        tagged.append((token.text, token.pos_))
    return tagged

# Named Entity Recognition using Spacy
def named_entity_recognition(text):
    """Return (entity text, entity label) for every entity spaCy finds in `text`."""
    found = []
    for entity in spacy_nlp(text).ents:
        found.append((entity.text, entity.label_))
    return found

# Vectorization using TF-IDF
def tfidf_vectorize(text_series):
    """Fit a TF-IDF vectorizer (at most 5000 features) and return the dense matrix."""
    vectorizer = TfidfVectorizer(max_features=5000)
    sparse_matrix = vectorizer.fit_transform(text_series)
    return sparse_matrix.toarray()

# Word2Vec Embeddings function using Spacy
def generate_word2vec_embeddings(text_series):
    """Train a Word2Vec model on the spaCy tokens of each text.

    Each text becomes one training "sentence": the list of its spaCy token
    texts with punctuation tokens removed.
    """
    tokenized_sentences = [
        [token.text for token in spacy_nlp(text) if not token.is_punct]
        for text in text_series
    ]

    # 100-dimensional vectors, context window of 5, words seen fewer than 2 times ignored
    return Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=4)


In [None]:
def _each_sentence(fn):
    """Wrap `fn` so it is applied to every element of a list and the results are collected."""
    return lambda groups: [fn(group) for group in groups]


TEXT_SIDES = ('article', 'summary')

# Sentence tokenization of the cleaned text (spaCy)
for side in TEXT_SIDES:
    df[f'{side}_sentences'] = df[f'{side}_cleaned'].apply(spacy_sentence_tokenize)

# Word tokenization, one token list per sentence (spaCy)
for side in TEXT_SIDES:
    df[f'{side}_words'] = df[f'{side}_sentences'].apply(_each_sentence(spacy_word_tokenize))

# Lemmatization, per sentence
for side in TEXT_SIDES:
    df[f'{side}_words_lemmatized'] = df[f'{side}_words'].apply(_each_sentence(lemmatize_words))

# Stemming (optional), articles only
df['article_words_stemmed'] = df['article_words'].apply(_each_sentence(stem_words))

# Named entities of each cleaned article
df['article_entities'] = df['article_cleaned'].apply(named_entity_recognition)

# TF-IDF matrix over the cleaned articles
tfidf_matrix = tfidf_vectorize(df['article_cleaned'])
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

# Word2Vec model trained on the cleaned articles
word2vec_model = generate_word2vec_embeddings(df['article_cleaned'])

# POS tags, per sentence (spaCy)
df['article_pos'] = df['article_words'].apply(_each_sentence(spacy_pos_tagging))

TF-IDF Matrix Shape: (1000, 5000)


In [None]:
# Destination on Drive for the fully processed dataset
final_save_path = '/content/drive/My Drive/CNN_Dataset_Preprocessing/final_cleaned_dataset.csv'

# Write every column of the DataFrame without the index.
# Note: CSV stores list-valued columns as text.
df.to_csv(path_or_buf=final_save_path, index=False)

print(f"Final cleaned dataset saved successfully to: {final_save_path}")


Final cleaned dataset saved successfully to: /content/drive/My Drive/CNN_Dataset_Preprocessing/final_cleaned_dataset.csv


In [None]:
import numpy as np

# Destination on Drive for the TF-IDF matrix
tfidf_save_path = '/content/drive/My Drive/CNN_Dataset_Preprocessing/tfidf_matrix.npy'

# Store the matrix in NumPy's .npy format
np.save(file=tfidf_save_path, arr=tfidf_matrix)

print(f"TF-IDF matrix saved successfully to: {tfidf_save_path}")


TF-IDF matrix saved successfully to: /content/drive/My Drive/CNN_Dataset_Preprocessing/tfidf_matrix.npy


In [None]:
# Destination on Drive for the trained Word2Vec model
word2vec_save_path = '/content/drive/My Drive/CNN_Dataset_Preprocessing/word2vec_model.model'

# Persist the model with its own save() method
word2vec_model.save(word2vec_save_path)

print(f"Word2Vec model saved successfully to: {word2vec_save_path}")


Word2Vec model saved successfully to: /content/drive/My Drive/CNN_Dataset_Preprocessing/word2vec_model.model
