<a href="https://colab.research.google.com/github/VISHNUVARDHAN2730/NLP-4080/blob/main/2403A54080_Lab_Assignment_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Using NLTK**

# **Load the dataset**

In [None]:
import pandas as pd
# Load only the first 1000 rows of the arXiv CSV from the Colab /content folder.
df = pd.read_csv(
    "/content/arxiv_data.csv",
    engine='python',
    nrows=1000,
)
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# **Clean the data**

In [None]:
import re

def clean_data(data):
    """Normalise one raw text string into lowercase ASCII alphanumeric words.

    Steps, in order: remove URLs, HTML-style tags, @mentions and #hashtags;
    lowercase; remove emoji; delete every character that is not an ASCII
    letter, digit or whitespace; collapse whitespace runs into single spaces.

    Args:
        data: The text to clean. ``None`` and float NaN (a missing cell in a
            pandas column) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` when ``data`` is missing.
    """
    # A missing value would make re.sub raise TypeError and abort an entire
    # ``Series.apply`` run, so map it to empty text up front.
    if data is None or (isinstance(data, float) and data != data):
        return ""

    data = re.sub(r'http\S+|www\S+|https\S+', '', data, flags=re.MULTILINE)
    data = re.sub(r'<.*?>', '', data)
    data = re.sub(r'@\w+', '', data)
    data = re.sub(r'#\w+', '', data)
    data = data.lower()
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    data = emoji_pattern.sub(r'', data)
    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # terms fuse into one word ("semi-supervised" -> "semisupervised").
    data = re.sub(r'[^a-zA-Z0-9\s]', '', data)
    data = re.sub(r'\s+', ' ', data).strip()
    return data

In [None]:
# Clean every raw summary and store the result next to the original text.
df['processed_summaries'] = (
    df['summaries'].apply(clean_data)
)
print(
    df[['summaries', 'processed_summaries']].head()
)

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                                 processed_summaries  
0  stereo matching is one of the widely used tech...  
1  the recent advancements in artificial intellig...  
2  in this paper we proposed a novel mutual consi...  
3  consistency training has proven to be an advan...  
4  to ensure safety in automated driving the corr...  


# **Word Tokenization**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Make sure the 'punkt_tab' tokenizer data is available before tokenizing.
nltk.download('punkt_tab')

# Split each cleaned summary into a list of word tokens.
df['tokenized_summaries'] = (
    df['processed_summaries'].apply(word_tokenize)
)
print(
    df[['processed_summaries', 'tokenized_summaries']].head()
)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                 processed_summaries  \
0  stereo matching is one of the widely used tech...   
1  the recent advancements in artificial intellig...   
2  in this paper we proposed a novel mutual consi...   
3  consistency training has proven to be an advan...   
4  to ensure safety in automated driving the corr...   

                                 tokenized_summaries  
0  [stereo, matching, is, one, of, the, widely, u...  
1  [the, recent, advancements, in, artificial, in...  
2  [in, this, paper, we, proposed, a, novel, mutu...  
3  [consistency, training, has, proven, to, be, a...  
4  [to, ensure, safety, in, automated, driving, t...  


# **Stop Word Removal**

In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words() reads from.
nltk.download('stopwords')

# English stop words held in a set for fast membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Order and duplicates of the surviving tokens are preserved.
    """
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Drop stop words from every token list.
df['filtered_summaries'] = (
    df['tokenized_summaries'].apply(remove_stopwords)
)
print(
    df[['tokenized_summaries', 'filtered_summaries']].head()
)

                                 tokenized_summaries  \
0  [stereo, matching, is, one, of, the, widely, u...   
1  [the, recent, advancements, in, artificial, in...   
2  [in, this, paper, we, proposed, a, novel, mutu...   
3  [consistency, training, has, proven, to, be, a...   
4  [to, ensure, safety, in, automated, driving, t...   

                                  filtered_summaries  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancements, artificial, intelligenc...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Lemmatization**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data before creating the lemmatizer that depends on it.
nltk.download('wordnet')
# One shared lemmatizer instance, reused by lemmatize_tokens for every row.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default applies to every word.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the stop-word-free token lists.
df['lemmatized_summaries'] = (
    df['filtered_summaries'].apply(lemmatize_tokens)
)
print(
    df[['filtered_summaries', 'lemmatized_summaries']].head()
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                  filtered_summaries  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancements, artificial, intelligenc...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                lemmatized_summaries  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancement, artificial, intelligence...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


# **Rejoining**

In [None]:
# Glue each lemma list back into one space-separated string.
df['clean_summaries'] = df['lemmatized_summaries'].apply(' '.join)
print(
    df[['summaries', 'clean_summaries']].head()
)

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                                     clean_summaries  
0  stereo matching one widely used technique infe...  
1  recent advancement artificial intelligence ai ...  
2  paper proposed novel mutual consistency networ...  
3  consistency training proven advanced semisuper...  
4  ensure safety automated driving correct percep...  


# **Part-of-Speech (POS) Tagging**

In [None]:
import nltk
from nltk import pos_tag
# Make sure the English perceptron tagger model is available before tagging.
nltk.download('averaged_perceptron_tagger_eng')

# Tag every lemma list; each row becomes a list of (word, tag) pairs.
df['pos_tagged_summaries'] = (
    df['lemmatized_summaries'].apply(pos_tag)
)
print(
    df[['lemmatized_summaries', 'pos_tagged_summaries']].head()
)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


                                lemmatized_summaries  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancement, artificial, intelligence...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                pos_tagged_summaries  
0  [(stereo, NN), (matching, VBG), (one, CD), (wi...  
1  [(recent, JJ), (advancement, JJ), (artificial,...  
2  [(paper, NN), (proposed, VBN), (novel, JJ), (m...  
3  [(consistency, NN), (training, VBG), (proven, ...  
4  [(ensure, VB), (safety, NN), (automated, VBN),...  


# **Noun Phrase Frequency**

In [None]:
from nltk.chunk import RegexpParser

# Chunk rule: an optional determiner, any number of adjectives, then one or
# more noun tags form a single NP.
grammar = r"""
  NP: {<DT>?<JJ>*<NN.*>+}
"""

np_parser = RegexpParser(grammar)

# Parse each tagged summary into a chunk tree.
df['chunked_summaries'] = (
    df['pos_tagged_summaries'].apply(np_parser.parse)
)
print(
    df[['pos_tagged_summaries', 'chunked_summaries']].head()
)

                                pos_tagged_summaries  \
0  [(stereo, NN), (matching, VBG), (one, CD), (wi...   
1  [(recent, JJ), (advancement, JJ), (artificial,...   
2  [(paper, NN), (proposed, VBN), (novel, JJ), (m...   
3  [(consistency, NN), (training, VBG), (proven, ...   
4  [(ensure, VB), (safety, NN), (automated, VBN),...   

                                   chunked_summaries  
0  [[(stereo, NN)], (matching, VBG), (one, CD), (...  
1  [[(recent, JJ), (advancement, JJ), (artificial...  
2  [[(paper, NN)], (proposed, VBN), [(novel, JJ),...  
3  [[(consistency, NN)], (training, VBG), (proven...  
4  [(ensure, VB), [(safety, NN)], (automated, VBN...  


In [None]:
from collections import Counter

def extract_nps(tree):
    """Collect the text of every top-level ``NP`` chunk in a chunk tree.

    Only direct children of ``tree`` are inspected. Each matching chunk is
    returned as its leaf words joined by single spaces, in tree order.
    """
    return [
        ' '.join(token for token, _tag in chunk.leaves())
        for chunk in tree
        if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'NP'
    ]

# Pull the NP strings out of every chunk tree.
df['extracted_nps'] = df['chunked_summaries'].apply(extract_nps)

# Flatten the per-summary lists into one corpus-wide list before counting.
all_noun_phrases = []
for nps_list in df['extracted_nps']:
    all_noun_phrases.extend(nps_list)

np_counts = Counter(all_noun_phrases)
print("\nTop 20 Most Frequent Noun Phrases:")
print(np_counts.most_common(20))


Top 20 Most Frequent Noun Phrases:
[('method', 272), ('image', 206), ('model', 187), ('approach', 133), ('data', 131), ('medical image segmentation', 121), ('image segmentation', 114), ('segmentation', 110), ('semantic segmentation', 103), ('performance', 91), ('datasets', 86), ('task', 82), ('paper', 80), ('network', 79), ('medical image', 72), ('semantic image segmentation', 67), ('result', 63), ('deep learning', 63), ('framework', 62), ('neural network', 56)]


# **NLTK Pipeline Function**

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Resources used by the pipeline below: tokenizer data, the stop-word list, and WordNet.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Built on first use and shared by every call, so applying the pipeline to a
# whole column does not rebuild the lemmatizer and stop-word set once per row.
_NLTK_PIPELINE_RESOURCES = {}


def _nltk_pipeline_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _NLTK_PIPELINE_RESOURCES:
        _NLTK_PIPELINE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _NLTK_PIPELINE_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return (
        _NLTK_PIPELINE_RESOURCES['lemmatizer'],
        _NLTK_PIPELINE_RESOURCES['stop_words'],
    )


def nltk_preprocessing_pipeline(text):
    """Clean, tokenize, stop-word-filter and lemmatize one text in a single call.

    The cleaning stage removes URLs, HTML-style tags, @mentions, #hashtags and
    emoji, lowercases the text, deletes every character that is not an ASCII
    letter, digit or whitespace, and collapses whitespace. The result is then
    tokenized with ``word_tokenize``, filtered against the English stop-word
    list, lemmatized, and re-joined with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (a missing cell in a pandas
            column) are treated as empty text.

    Returns:
        The processed text as one space-separated string, or ``""`` when
        ``text`` is missing.
    """
    # A missing value would make re.sub raise TypeError and abort an entire
    # ``Series.apply`` run, so map it to empty text up front.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lemmatizer, stop_words = _nltk_pipeline_resources()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # terms fuse into one word before tokenization.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokenized_words = word_tokenize(text)
    filtered_words = [word for word in tokenized_words if word not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    clean_summary = ' '.join(lemmatized_words)

    return clean_summary

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Run the single-call pipeline on the raw summaries and show it beside the
# step-by-step result for a visual comparison.
df['clean_summaries_pipeline'] = (
    df['summaries'].apply(nltk_preprocessing_pipeline)
)
print("\nComparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):")
print(
    df[['clean_summaries', 'clean_summaries_pipeline']].head()
)


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                                     clean_summaries  \
0  stereo matching one widely used technique infe...   
1  recent advancement artificial intelligence ai ...   
2  paper proposed novel mutual consistency networ...   
3  consistency training proven advanced semisuper...   
4  ensure safety automated driving correct percep...   

                            clean_summaries_pipeline  
0  stereo matching one widely used technique infe...  
1  recent advancement artificial intelligence ai ...  
2  paper proposed novel mutual consistency networ...  
3  consistency training proven advanced semisuper...  
4  ensure safety automated driving correct percep...  


# **Using spaCy**

# **Loading the dataset**

In [None]:
import pandas as pd
# Reload the same 1000 rows for the spaCy walkthrough. This rebinds `df`, so
# the columns added in the NLTK section are not present on the new frame.
df = pd.read_csv(
    "/content/arxiv_data.csv",
    nrows=1000,
    engine='python',
)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# **Cleaning the data**

In [None]:
import re

def preprocess_text(text):
    """Normalise one raw text string into lowercase ASCII alphanumeric words.

    Steps, in order: remove URLs, HTML-style tags, @mentions and #hashtags;
    lowercase; remove emoji; delete every character that is not an ASCII
    letter, digit or whitespace; collapse whitespace runs into single spaces.

    Args:
        text: The text to clean. ``None`` and float NaN (a missing cell in a
            pandas column) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # A missing value would make re.sub raise TypeError and abort an entire
    # ``Series.apply`` run, so map it to empty text up front.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    text = text.lower()

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # terms fuse into one word ("semi-supervised" -> "semisupervised").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Clean every raw summary and store the result next to the original text.
df['processed_summaries'] = (
    df['summaries'].apply(preprocess_text)
)
print(
    df[['summaries', 'processed_summaries']].head()
)

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                                 processed_summaries  
0  stereo matching is one of the widely used tech...  
1  the recent advancements in artificial intellig...  
2  in this paper we proposed a novel mutual consi...  
3  consistency training has proven to be an advan...  
4  to ensure safety in automated driving the corr...  


# **Word Tokenization**

In [None]:
import spacy

# Shared spaCy pipeline; every spaCy helper below reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')

def tokenize_text_spacy(text):
    """Split ``text`` with the spaCy tokenizer and return the token strings.

    Only token boundaries are needed here, so the text goes through
    ``nlp.make_doc`` (tokenizer only) instead of ``nlp(text)``, which would
    also run every other pipeline component on each summary.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``text`` of each token, in order.
    """
    # NOTE(review): assumes no component in `nlp` merges or splits tokens;
    # confirm if a retokenizing component is ever added to the pipeline.
    doc = nlp.make_doc(text)
    return [token.text for token in doc]

# Split each cleaned summary into a list of spaCy token strings.
df['tokenized_summaries'] = (
    df['processed_summaries'].apply(tokenize_text_spacy)
)

print(
    df[['processed_summaries', 'tokenized_summaries']].head()
)

                                 processed_summaries  \
0  stereo matching is one of the widely used tech...   
1  the recent advancements in artificial intellig...   
2  in this paper we proposed a novel mutual consi...   
3  consistency training has proven to be an advan...   
4  to ensure safety in automated driving the corr...   

                                 tokenized_summaries  
0  [stereo, matching, is, one, of, the, widely, u...  
1  [the, recent, advancements, in, artificial, in...  
2  [in, this, paper, we, proposed, a, novel, mutu...  
3  [consistency, training, has, proven, to, be, a...  
4  [to, ensure, safety, in, automated, driving, t...  


# **Stop Word Removal**

In [None]:
def remove_stopwords_spacy(tokens):
    """Return the tokens that spaCy does not flag as stop words.

    The token strings are re-joined with spaces and re-tokenized so each one
    carries spaCy's ``is_stop`` flag. That flag is a lexical attribute, so the
    tokenizer-only ``nlp.make_doc`` is enough; the rest of the pipeline is not
    run.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of the token texts whose ``is_stop`` flag is false, in order.
    """
    doc = nlp.make_doc(' '.join(tokens))
    return [token.text for token in doc if not token.is_stop]

# Drop spaCy stop words from every token list.
df['summaries_no_stopwords'] = (
    df['tokenized_summaries'].apply(remove_stopwords_spacy)
)
print(
    df[['tokenized_summaries', 'summaries_no_stopwords']].head()
)

                                 tokenized_summaries  \
0  [stereo, matching, is, one, of, the, widely, u...   
1  [the, recent, advancements, in, artificial, in...   
2  [in, this, paper, we, proposed, a, novel, mutu...   
3  [consistency, training, has, proven, to, be, a...   
4  [to, ensure, safety, in, automated, driving, t...   

                              summaries_no_stopwords  
0  [stereo, matching, widely, techniques, inferri...  
1  [recent, advancements, artificial, intelligenc...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


# **Lemmatization**

In [None]:
def lemmatize_text_spacy(tokens):
    """Return the spaCy lemma of each token in the re-joined token list.

    The tokens are joined with single spaces and run through the full ``nlp``
    pipeline; the ``lemma_`` of every resulting token is returned in order.
    """
    joined_text = ' '.join(tokens)
    lemmas = []
    for token in nlp(joined_text):
        lemmas.append(token.lemma_)
    return lemmas

# Lemmatize the stop-word-free token lists.
df['lemmatized_summaries'] = (
    df['summaries_no_stopwords'].apply(lemmatize_text_spacy)
)
print(
    df[['summaries_no_stopwords', 'lemmatized_summaries']].head()
)

                              summaries_no_stopwords  \
0  [stereo, matching, widely, techniques, inferri...   
1  [recent, advancements, artificial, intelligenc...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                lemmatized_summaries  
0  [stereo, matching, widely, technique, infer, d...  
1  [recent, advancement, artificial, intelligence...  
2  [paper, propose, novel, mutual, consistency, n...  
3  [consistency, training, prove, advanced, semis...  
4  [ensure, safety, automate, drive, correct, per...  


# **Rejoining**

In [None]:
def rejoin_summaries(lemmas):
    """Join a sequence of lemma strings into one space-separated string."""
    separator = ' '
    return separator.join(lemmas)

# Turn each lemma list back into a single string for the noun-chunk step.
df['final_summaries'] = (
    df['lemmatized_summaries'].apply(rejoin_summaries)
)
print(
    df[['lemmatized_summaries', 'final_summaries']].head()
)

                                lemmatized_summaries  \
0  [stereo, matching, widely, technique, infer, d...   
1  [recent, advancement, artificial, intelligence...   
2  [paper, propose, novel, mutual, consistency, n...   
3  [consistency, training, prove, advanced, semis...   
4  [ensure, safety, automate, drive, correct, per...   

                                     final_summaries  
0  stereo matching widely technique infer depth s...  
1  recent advancement artificial intelligence ai ...  
2  paper propose novel mutual consistency network...  
3  consistency training prove advanced semisuperv...  
4  ensure safety automate drive correct perceptio...  


# **Noun Phrase Frequencies**

In [None]:
def extract_noun_phrases(text):
    """Run ``text`` through ``nlp`` and return the lemma of each noun chunk.

    The chunks come from ``doc.noun_chunks`` and are returned in document
    order, one ``lemma_`` string per chunk.
    """
    parsed = nlp(text)
    phrases = []
    for chunk in parsed.noun_chunks:
        phrases.append(chunk.lemma_)
    return phrases

# Extract the noun-chunk lemmas from every processed summary.
df['noun_phrases'] = (
    df['final_summaries'].apply(extract_noun_phrases)
)
print(
    df[['final_summaries', 'noun_phrases']].head()
)

                                     final_summaries  \
0  stereo matching widely technique infer depth s...   
1  recent advancement artificial intelligence ai ...   
2  paper propose novel mutual consistency network...   
3  consistency training prove advanced semisuperv...   
4  ensure safety automate drive correct perceptio...   

                                        noun_phrases  
0  [stereo matching, widely technique infer depth...  
1  [recent advancement artificial intelligence, e...  
2  [paper, novel mutual consistency network mcnet...  
3  [consistency training, advanced semisupervise ...  
4  [safety automate drive correct perception situ...  


In [None]:
from collections import Counter

# Flatten the per-summary phrase lists into one corpus-wide list.
all_noun_phrases = [
    phrase
    for phrases_list in df['noun_phrases']
    for phrase in phrases_list
]

# Tally how often each phrase occurs across all summaries.
noun_phrase_counts = Counter(all_noun_phrases)

print("Top 10 most frequent noun phrases:")
for noun_phrase, count in noun_phrase_counts.most_common(10):
    print(f"'{noun_phrase}': {count}")

Top 10 most frequent noun phrases:
'method': 55
'algorithm': 51
'deep learning': 38
'paper': 37
'propose method': 37
'deep neural network': 30
'image segmentation': 30
'image': 27
'approach': 26
'performance': 25
