In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
# cd /content/gdrive/MyDrive/Inlp-Project/Datasets/

[Errno 2] No such file or directory: '/content/gdrive/MyDrive/Inlp-Project/Datasets/'
/content


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import argparse
import numpy as np
import pandas as pd
import pickle
import nltk
import spacy
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
import torch
from torch.utils.data import Dataset, DataLoader
import torchtext

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:


class TextProcessor:
    def __init__(self, data_file: str, freq_threshold: int = 5):
        """
        Initializes a TextProcessor object.

        Args:
            data_file (str): The path to the CSV file containing the text data.
            freq_threshold (int, optional): The frequency threshold for word occurrence. Defaults to 5.
        """
        self.data_file = data_file
        self.dataset = pd.read_csv(data_file)
        self.freq_threshold = freq_threshold

    def process_text(self) -> tuple:
        """
        Processes the text data.

        Returns:
            tuple: A tuple containing:
                - token_mappings (list of lists): Token mappings for each sentence in the dataset.
                - reverse_word_mapper (dict): Mapping from index to word.
        """
        spacy.prefer_gpu()

        text_tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en_core_web_sm')
        tokenized_text = []
        sentence_groups = []
        for entry in self.dataset['text'].tolist():
            tokenized_text.append(text_tokenizer(entry))
            sentence_groups.append(entry.split('.'))

        self.dataset['grouped_sentences'] = sentence_groups

        word_counter = Counter()
        for line in tokenized_text:
            for word in line:
                word_counter[word] += 1

        word_mapper = {word[0]: idx+1 for idx, word in enumerate(word_counter.most_common())}
        reverse_word_mapper = {idx+1: word[0] for idx, word in enumerate(word_counter.most_common())}
        other_index = len(word_counter.keys())

        token_mappings = []
        for line in tokenized_text:
            mapped_line = []
            for word in line:
                mapped_line.append(word_mapper.get(word, other_index))
            token_mappings.append(mapped_line)

        return token_mappings, reverse_word_mapper





In [6]:
def compute_paragraph_similarity(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Computes the similarity scores between consecutive sentences within each paragraph.

    Args:
        dataset (pd.DataFrame): The dataset containing text data.

    Returns:
        pd.DataFrame: The dataset with similarity scores added as a new column.
    """
    similarity_scores = []
    for paragraph in dataset['grouped_sentences'].tolist():
        similarity = 200
        initial_sentence = paragraph[0]
        paragraph = paragraph[1:]
        for sentence in paragraph:
            tokenized_initial = word_tokenize(initial_sentence)
            tokenized_current = word_tokenize(sentence)

            sw = set(stopwords.words('english'))
            initial_set = {w for w in tokenized_initial if not w in sw}
            current_set = {w for w in tokenized_current if not w in sw}

            combined_set = initial_set.union(current_set)
            vector1 = [1 if w in initial_set else 0 for w in combined_set]
            vector2 = [1 if w in current_set else 0 for w in combined_set]

            dot_product = sum(i * j for i, j in zip(vector1, vector2))
            try:
                cosine_similarity = dot_product / float((sum(vector1)*sum(vector2))**0.5)
                if similarity > cosine_similarity:
                    similarity = cosine_similarity
            except ZeroDivisionError:
                similarity += 0

            initial_sentence = sentence

        similarity_scores.append(similarity)

    dataset['similarity_scores'] = similarity_scores

    return dataset

In [8]:
WikiCnn = pd.read_csv('/content/drive/MyDrive/WikiCnn.csv')


In [9]:
WikiCnn.describe()



Unnamed: 0,label,train,file_id,sen_position
count,179084.0,179084.0,179084.0,179084.0
mean,0.523894,0.873428,51808.790115,0.688236
std,0.49943,0.332494,30375.208021,2.006377
min,0.0,0.0,1.0,-1.0
25%,0.0,1.0,25693.0,-1.0
50%,1.0,1.0,51375.0,-1.0
75%,1.0,1.0,77055.25,2.0
max,1.0,1.0,111290.0,7.0


In [10]:
WikiCnn.head()

Unnamed: 0,ctx,label,to_be_replaced,train,file_id,replace_with,sen_position,ctx-replaced
0,"Estramustine (INN, USAN, BAN) (brand names Emc...",0,"It is a derivative of estradiol, an estrogen, ...",0,2202,"It is the L-alanine ester of estramustine, whi...",1,"Estramustine (INN, USAN, BAN) (brand names Emc..."
1,L² Puppis (also known as HD 56096) is a giant ...,0,This is most likely an asymptotic giant branch...,0,1034,It has an apparent visual magnitude of 6.26.,3,L² Puppis (also known as HD 56096) is a giant ...
2,David John (Davy/Davey) Gunn (1887-1955) was a...,0,"He covered the 90 kilometre, four day journey ...",0,4180,Gunn played a role in the implementation of th...,3,David John (Davy/Davey) Gunn (1887-1955) was a...
3,Olivia Hussey (born Olivia Osuna; 17 April 195...,0,"She is also well known for her role as Mary, t...",0,1932,She is best known for her collaborations with ...,1,Olivia Hussey (born Olivia Osuna; 17 April 195...
4,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava...",0,He re-established the Western Chalukya dynasty...,0,8118,This Kannadiga dynasty is sometimes called the...,1,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava..."


In [11]:
WikiCnn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179084 entries, 0 to 179083
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ctx             179084 non-null  object
 1   label           179084 non-null  int64 
 2   to_be_replaced  85263 non-null   object
 3   train           179084 non-null  int64 
 4   file_id         179084 non-null  int64 
 5   replace_with    85263 non-null   object
 6   sen_position    179084 non-null  int64 
 7   ctx-replaced    85263 non-null   object
dtypes: int64(4), object(4)
memory usage: 10.9+ MB


In [14]:
wikicnn_train = TextProcessor('/content/drive/MyDrive/WikiCnn.csv')


In [15]:
wikicnn_train.dataset = wikicnn_train.dataset.drop(['label','to_be_replaced','train','file_id','replace_with','sen_position'],axis = 1)
wikicnn_train.dataset

Unnamed: 0,ctx,ctx-replaced
0,"Estramustine (INN, USAN, BAN) (brand names Emc...","Estramustine (INN, USAN, BAN) (brand names Emc..."
1,L² Puppis (also known as HD 56096) is a giant ...,L² Puppis (also known as HD 56096) is a giant ...
2,David John (Davy/Davey) Gunn (1887-1955) was a...,David John (Davy/Davey) Gunn (1887-1955) was a...
3,Olivia Hussey (born Olivia Osuna; 17 April 195...,Olivia Hussey (born Olivia Osuna; 17 April 195...
4,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava...","Tailapa II, or Taila, (r.973–997 CE) (or Ahava..."
...,...,...
179079,"-LRB- CNN -RRB- If you listen to rock or pop, ...",
179080,"-LRB- CNN -RRB- -- Pirates have struck again, ...",
179081,-LRB- CNN -RRB- -- Russia will begin the const...,
179082,-LRB- CNN -RRB- -- The man police say kidnappe...,


In [16]:
pwd

'/content'

In [17]:
# Create a copy of the 'ctx' column for positive data
positive_data = wikicnn_train.dataset[['ctx']].copy()
positive_labels = [1] * len(positive_data)
positive_data = positive_data.rename(columns={'ctx': 'text'})
positive_data['label'] = positive_labels

# Create a copy of the 'ctx-replaced' column for negative data
negative_data = wikicnn_train.dataset[['ctx-replaced']].copy()
negative_data = negative_data.rename(columns={'ctx-replaced': 'text'})
negative_data = negative_data.dropna()  # Remove any rows with missing text
negative_labels = [0] * len(negative_data)
negative_data['label'] = negative_labels

# Combine the positive and negative datasets
wikicnn_train.dataset = pd.concat([positive_data, negative_data])
wikicnn_train.dataset


Unnamed: 0,text,label
0,"Estramustine (INN, USAN, BAN) (brand names Emc...",1
1,L² Puppis (also known as HD 56096) is a giant ...,1
2,David John (Davy/Davey) Gunn (1887-1955) was a...,1
3,Olivia Hussey (born Olivia Osuna; 17 April 195...,1
4,"Tailapa II, or Taila, (r.973–997 CE) (or Ahava...",1
...,...,...
142301,"-LRB- Fast Company -RRB- -- For years, employ...",0
142302,Paris -LRB- CNN -RRB- -- France will start wit...,0
142303,-LRB- CNN -RRB- -- Pinterest is the breakout s...,0
142304,New York -LRB- CNN -RRB- -- Officer Rafael Ram...,0


In [18]:
wikicnn_train_mapping, inv_wikicnn_train_mapping = wikicnn_train.process_text()
print("Preprocessing complete. Mapping and inverse mapping obtained.")

wikicnn_train.dataset['encoding'] = wikicnn_train_mapping
print("Encoding column added to the DataFrame.")
print("Updated DataFrame with encoding:\n", wikicnn_train.dataset.head())

wikicnn_train.dataset = compute_paragraph_similarity(wikicnn_train.dataset)
print("Data transformed using the similarity_paragraph function.")
print("Transformed Data:\n", wikicnn_train.dataset.head())


Preprocessing complete. Mapping and inverse mapping obtained.
Encoding column added to the DataFrame.
Updated DataFrame with encoding:
                                                 text  label  \
0  Estramustine (INN, USAN, BAN) (brand names Emc...      1   
1  L² Puppis (also known as HD 56096) is a giant ...      1   
2  David John (Davy/Davey) Gunn (1887-1955) was a...      1   
3  Olivia Hussey (born Olivia Osuna; 17 April 195...      1   
4  Tailapa II, or Taila, (r.973–997 CE) (or Ahava...      1   

                                   grouped_sentences  \
0  [Estramustine (INN, USAN, BAN) (brand names Em...   
1  [L² Puppis (also known as HD 56096) is a giant...   
2  [David John (Davy/Davey) Gunn (1887-1955) was ...   
3  [Olivia Hussey (born Olivia Osuna; 17 April 19...   
4  [Tailapa II, or Taila, (r, 973–997 CE) (or Aha...   

                                            encoding  
0  [186844, 20, 15781, 1, 23095, 1, 25587, 19, 20...  
1  [186849, 59002, 20, 47, 66, 18, 871

In [19]:
# Randomly select 80% of the data for the training set using a fixed random seed for reproducibility.
training_data = wikicnn_train.dataset.sample(frac=0.8, random_state=200)

# Create the testing set by removing the indices selected for the training set from the original dataset.
testing_data = wikicnn_train.dataset.drop(training_data.index)


In [20]:
training_data

Unnamed: 0,text,label,grouped_sentences,encoding,similarity_scores
157493,-LRB- CNN -RRB- -- World football 's governing...,1,[-LRB- CNN -RRB- -- World football 's governin...,"[31, 32, 33, 27, 137, 228, 15, 3505, 448, 2498...",0.100000
61962,Turrilitidae is a family of extinct heteromor...,1,[Turrilitidae is a family of extinct heteromo...,"[334529, 6, 10, 8, 151, 4, 3613, 334530, 26669...",0.000000
94326,Digidogheadlock is the eighth album by Japanes...,1,[Digidogheadlock is the eighth album by Japane...,"[391326, 10, 2, 2986, 105, 21, 644, 207, 12, 8...",0.091287
118888,"London, England -LRB- CNN -RRB- -- The Britis...",1,"[London, England -LRB- CNN -RRB- -- The Briti...","[282, 1, 6, 279, 31, 32, 33, 27, 12, 189, 2704...",0.000000
154898,-LRB- CNN -RRB- -- Inside the Charles Manson r...,1,[-LRB- CNN -RRB- -- Inside the Charles Manson ...,"[31, 32, 33, 27, 6254, 2, 966, 11985, 1290, 26...",0.113228
...,...,...,...,...,...
108315,-LRB- CNN -RRB- -- Nurse Kaci Hickox and her b...,1,[-LRB- CNN -RRB- -- Nurse Kaci Hickox and her ...,"[31, 32, 33, 27, 20756, 52483, 34419, 5, 56, 5...",0.108465
131663,"Cairo, Egypt -LRB- CNN -RRB- -- Authorities i...",1,"[Cairo, Egypt -LRB- CNN -RRB- -- Authorities ...","[2854, 1, 6, 1262, 31, 32, 33, 27, 2117, 7, 12...",0.120386
155532,-LRB- CNN -RRB- -- Pontiac lovers are feeling ...,1,[-LRB- CNN -RRB- -- Pontiac lovers are feeling...,"[31, 32, 33, 27, 20311, 9063, 36, 3822, 28318,...",0.070014
115363,Russian Prime Minister Dmitry Medvedev has sig...,1,[Russian Prime Minister Dmitry Medvedev has si...,"[559, 950, 484, 14097, 13352, 34, 921, 8, 9321...",0.000000


In [21]:
testing_data

Unnamed: 0,text,label,grouped_sentences,encoding,similarity_scores
33,The swimming competitions at the 2016 Summer O...,1,[The swimming competitions at the 2016 Summer ...,"[12, 4251, 4012, 26, 2, 441, 1787, 1247, 7, 27...",0.157135
56,The Orlando Shakespeare Theater is a theater c...,1,[The Orlando Shakespeare Theater is a theater ...,"[12, 4419, 6551, 5477, 10, 8, 2777, 174, 149, ...",0.077152
67,"Histocompatibility, or tissue compatibility, i...",1,"[Histocompatibility, or tissue compatibility, ...","[126479, 1, 41, 5333, 22527, 1, 10, 2, 1174, 4...",0.000000
86,"William Morris (January 1, 1861 – January 11, ...",1,"[William Morris (January 1, 1861 – January 11,...","[650, 5247, 20, 247, 158, 1, 6889, 176, 247, 4...",0.000000
97,Alphonse Areola (born 27 February 1993) is a F...,1,[Alphonse Areola (born 27 February 1993) is a ...,"[26763, 126496, 20, 96, 726, 331, 1120, 19, 10...",0.000000
...,...,...,...,...,...
142202,Tokyo -LRB- CNN -RRB- -- Japanese Prime Minist...,0,[Tokyo -LRB- CNN -RRB- -- Japanese Prime Minis...,"[2166, 31, 32, 33, 27, 644, 950, 484, 34407, 1...",0.000000
142231,-LRB- CNN -RRB- -- With a first name that mean...,0,[-LRB- CNN -RRB- -- With a first name that mea...,"[31, 32, 33, 27, 627, 8, 49, 125, 23, 795, 14,...",0.133333
142269,-LRB- CNN -RRB- -- As about 2 % of babies born...,0,[-LRB- CNN -RRB- -- As about 2 % of babies bor...,"[31, 32, 33, 27, 218, 68, 193, 338, 4, 7117, 9...",0.055048
142302,Paris -LRB- CNN -RRB- -- France will start wit...,0,[Paris -LRB- CNN -RRB- -- France will start wi...,"[927, 31, 32, 33, 27, 506, 69, 768, 14830, 980...",0.000000


In [24]:
# training_data.to_csv('/content/gdrive/MyDrive/Inlp-Project/Datasets/training_data.csv')
# testing_data.to_csv('/content/gdrive/MyDrive/Inlp-Project/Datasets/testing_data.csv')

In [22]:
np.random.seed(7)

import tensorflow as tf

# Prepare the training data: pad the sequences to a maximum length of 500
X_train = sequence.pad_sequences(training_data['encoding'], maxlen=500)
print("Training data shapes (X_train):", X_train.shape)

# Get the labels for the training data and convert them to one-hot encoded vectors
y_train = training_data['label']
y_train = tf.one_hot(y_train, depth=2)
print("Training labels shapes (y_train):", y_train.shape)

# Prepare the testing data: similarly, pad the sequences to a maximum length of 500
X_test = sequence.pad_sequences(testing_data['encoding'], maxlen=500)
print("Testing data shapes (X_test):", X_test.shape)

# Get the labels for the testing data and convert them to one-hot encoded vectors
y_test = testing_data['label']
y_test = tf.one_hot(y_test, depth=2)
print("Testing labels shapes (y_test):", y_test.shape)


Training data shapes (X_train): (211478, 500)
Training labels shapes (y_train): (211478, 2)
Testing data shapes (X_test): (25516, 500)
Testing labels shapes (y_test): (25516, 2)


In [23]:
# Define the length of the embeddings
embedding_vector_length = 32

# Initialize the model as a sequential one
model = Sequential()

# Add an embedding layer configured for a maximum of 40,000 unique words, transforming them into 32-dimensional vectors, optimized for input sequences of 500 tokens
model.add(Embedding(input_dim=440424, output_dim=embedding_vector_length, input_length=500))

# Insert a GRU layer with 32 units, applying a 20% dropout rate to combat overfitting, and set to return sequences for further recurrent processing
model.add(GRU(units=32, dropout=0.2, return_sequences=True))

# Add another GRU layer with 32 units to process the final output of the sequence data
model.add(GRU(32))

# Append a dense output layer with two units and a softmax activation function for binary classification
model.add(Dense(units=2, activation='softmax'))

# Compile the model with the binary crossentropy loss function, using the Adam optimizer, and tracking accuracy as a metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Establish a checkpointing callback that saves only the model's weights to a specified path, monitoring the validation accuracy to retain only the best performing weights
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/gdrive/MyDrive/Inlp-Project/saved models/checkpoints',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True
)

# Output the model's architecture summary
print(model.summary())

# Train the model over 30 epochs with batches of 500 samples, using the checkpoint callback to save the best model configuration
model.fit(X_train, y_train, epochs=30, batch_size=512, callbacks=[model_checkpoint_callback])

# Evaluate the model's performance on the test dataset quietly and print the accuracy percentage
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(scores[1] * 100))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           128000000 
                                                                 
 gru (GRU)                   (None, 500, 32)           6336      
                                                                 
 gru_1 (GRU)                 (None, 32)                6336      
                                                                 
 dense (Dense)               (None, 2)                 66        
                                                                 
Total params: 128012738 (488.33 MB)
Trainable params: 128012738 (488.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/30



Epoch 2/30



Epoch 3/30



Epoch 4/30



Epoch 5/30



Epoch 6/30



Epoch 7/30



Epoch 8/30



Epoch 9/30



Epoch 10/30



Epoch 11/30



Epoch 12/30



Epoch 13/30



Epoch 14/30



Epoch 15/30



Epoch 16/30



Epoch 17/30



Epoch 18/30



Epoch 19/30



Epoch 20/30



Epoch 21/30



Epoch 22/30



Epoch 23/30



Epoch 24/30



Epoch 25/30



Epoch 26/30



Epoch 27/30



Epoch 28/30



Epoch 29/30



Epoch 30/30



Accuracy: 63.75%


In [27]:
# Set the random seed for reproducibility of results
np.random.seed(7)

# Prepare the training and testing data: Pad the encoded sequences to a length of 500
X_train = sequence.pad_sequences(training_data['encoding'], maxlen=500)
X_test = sequence.pad_sequences(testing_data['encoding'], maxlen=500)

# Convert the categorical labels to one-hot encoded vectors with two categories
y_train = tf.one_hot(training_data['label'], depth=2)
y_test = tf.one_hot(testing_data['label'], depth=2)

# Convert similarity scores to numpy arrays and reshape
similarity_train = training_data['similarity_scores'].to_numpy()[:, np.newaxis]
similarity_test = testing_data['similarity_scores'].to_numpy()[:, np.newaxis]

# Append the 'similarity' feature to each sequence in the training and testing sets
X_train = np.append(similarity_train, X_train, axis=1)
X_test = np.append(similarity_test, X_test, axis=1)

# Set the dimension length for embeddings
embedding_vector_length = 32

# Initialize a sequential model
model = Sequential()

# Add an embedding layer configured for a vocabulary of 40,000 words and an input sequence length of 501
model.add(Embedding(input_dim=440424, output_dim=embedding_vector_length, input_length=501))

# Insert a GRU layer with 32 units, including a 20% dropout rate to help prevent overfitting, and configure it to return sequences
model.add(GRU(units=32, dropout=0.2, return_sequences=True))

# Add another GRU layer with 32 units to process sequences to a final output
model.add(GRU(32))

# Include a dense output layer with two units and a softmax activation function to perform binary classification
model.add(Dense(units=2, activation='softmax'))

# Compile the model with the binary crossentropy loss function and the Adam optimizer, tracking accuracy as a performance metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display a summary of the model’s architecture
print(model.summary())

# Train the model for 20 epochs using batches of 500 samples
model.fit(X_train, y_train, epochs=20, batch_size=500)

# Evaluate the model's performance on the test dataset and print the accuracy in percentage
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(scores[1] * 100))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 501, 32)           14093568  
                                                                 
 gru_2 (GRU)                 (None, 501, 32)           6336      
                                                                 
 gru_3 (GRU)                 (None, 32)                6336      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 14106306 (53.81 MB)
Trainable params: 14106306 (53.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoc