# Question 1: Text Summarization
# Mounting the Drive

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Loading

In [40]:
import pandas as pd
file_path = '/content/drive/MyDrive/NLP/WEEK5/spaceX_DP.txt'
                                                                                # Open and read the text file
with open(file_path, 'r') as file:
    text = file.read()

In [41]:
text                                                                            #Just making sure data loaded correctly or not.



# Importing Spacy and String for the Puntuations and Stop words

In [42]:
                                                                                #Import necessary libraries
import spacy
import string
                                                                                # Load spacy model for stopwords
nlp = spacy.load('en_core_web_sm')

                                                                                # Get stopwords from spacy
stopwords = nlp.Defaults.stop_words
                                                                                # Get punctuation from string
punctuation = string.punctuation

                                                                                # Print the loaded text, stopwords, and punctuation
print("Loaded Text:", text[:200])                                               # printing the first 200 characters of the text
print("Stopwords:", list(stopwords)[:10])                                        # printing first 10 stopwords
print("Punctuation:", punctuation)


Loaded Text: Cape Canaveral this is not.But here, down toward the coast, on a spit of land past the Border Patrol checkpoint, where the Rio Grande meets the Gulf of Mexico, there is a spaceship being assembled off
Stopwords: ['some', 'been', 'hereby', 'those', 'unless', '‘re', 'serious', "'re", 'wherein', 'beforehand']
Punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


# Tokenizing the text in SpaceX_DP.txt

In [43]:
                                                                                #Tokenize the text using spacy
doc = nlp(text)
                                                                                # Extract tokens
tokens = [token.text for token in doc]

                                                                                # Print the first 20 tokens as an example
print("First 20 tokens:", tokens[:20])


First 20 tokens: ['Cape', 'Canaveral', 'this', 'is', 'not', '.', 'But', 'here', ',', 'down', 'toward', 'the', 'coast', ',', 'on', 'a', 'spit', 'of', 'land', 'past']


# Build 'word frequency'

In [44]:
from collections import Counter

                                                                                #Filter tokens by removing stopwords, punctuation, and any whitespace tokens
tokens = [token.text.lower() for token in doc if token.text.lower() not in stopwords and token.text not in punctuation and not token.is_space]

                                                                                #Use Counter to count the frequency of each token
word_freq = Counter(tokens)
                                                                                #Print the most common words
most_common_words = word_freq.most_common(10)                                   # Get the 10 most common words
print("Top 10 most common words:", most_common_words)


Top 10 most common words: [('spacex', 254), ('“', 216), ('”', 212), ('said', 127), ('boca', 119), ('chica', 112), ('company', 90), ('faa', 87), ('musk', 85), ('residents', 82)]


# Determine the maximum frequency by Normalizing

In [45]:
                                                                               #Find the maximum frequency
maximum_frequency = max(word_freq.values())
                                                                                #Normalize the word frequencies
word_frequencies_normalized = {word: (freq / maximum_frequency) for word, freq in word_freq.items()}
                                                                                # Step 3: Print the normalized word frequencies
word_frequencies_normalized


{'cape': 0.03543307086614173,
 'canaveral': 0.027559055118110236,
 'coast': 0.027559055118110236,
 'spit': 0.007874015748031496,
 'land': 0.05905511811023622,
 'past': 0.031496062992125984,
 'border': 0.1141732283464567,
 'patrol': 0.011811023622047244,
 'checkpoint': 0.01968503937007874,
 'rio': 0.03543307086614173,
 'grande': 0.03543307086614173,
 'meets': 0.003937007874015748,
 'gulf': 0.023622047244094488,
 'mexico': 0.027559055118110236,
 'spaceship': 0.031496062992125984,
 'assembled': 0.015748031496062992,
 'state': 0.09448818897637795,
 'highway': 0.07480314960629922,
 '4': 0.06299212598425197,
 'dead': 0.007874015748031496,
 'ends': 0.003937007874015748,
 'sea': 0.047244094488188976,
 'towering': 0.007874015748031496,
 'stainless': 0.023622047244094488,
 'steel': 0.03543307086614173,
 'shiny': 0.003937007874015748,
 'looks': 0.011811023622047244,
 'like': 0.14960629921259844,
 'surreal': 0.007874015748031496,
 'sculpture': 0.003937007874015748,
 'amid': 0.007874015748031496,
 

# Generate the sentence_scores & Score every sentence based on number of words.

In [46]:
from collections import defaultdict
                                                                                #Tokenize the sentences
sentences = [sent.text for sent in doc.sents]


In [47]:
                                                                                #Initialize sentence_scores
sentence_scores = defaultdict(float)
                                                                                #Score each sentence based on the number of words (excluding stopwords and punctuation)
for sentence in sentences:
    sentence_doc = nlp(sentence)
    word_count = sum(1 for token in sentence_doc if token.text.lower() not in stopwords and token.text not in punctuation)
    sentence_scores[sentence] = word_count
                                                                                #Print the sentence scores
sentence_scores


defaultdict(float,
            {'Cape Canaveral this is not.': 2,
             'But here, down toward the coast, on a spit of land past the Border Patrol checkpoint, where the Rio Grande meets the Gulf of Mexico, there is a spaceship being assembled off State Highway 4 just before it dead-ends into the sea.': 20,
             'Towering and stainless-steel shiny, it looks like a surreal sculpture amid the cactuses, yucca and relentless South Texas sun.': 15,
             'And, because it’s being built not in a factory but out in the open, it’s become a roadside attraction, drawing gawkers to an area so remote that the county trucks in drinking water once a month to the few who live nearby.': 16,
             'They’re': 0,
             'coming to see Elon Musk’s latest creation, a prototype called Starship that he hopes will one day carry people by the dozens to the moon and Mars.': 15,
             'Musk, in a presentation here Saturday, said his goal of building a “rapidly reusable spa

# Import nlargest from heapq and Provide summarized sentences based on sentence score

In [48]:
from heapq import nlargest
                                                                                #Define how many sentences we want in the summary
num_sentences = 5
                                                                                #Use nlargest to get the top-scoring sentences
summarized_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
                                                                                #Print the summarized sentences
summarized_sentences


['The metamorphosis of this facility, which sits on tender tidal flats and feet from sand dunes where sea turtles lay eggs, has largely gone unnoticed and under the radar in this border community where environmentalists say they are strapped between fighting the construction of Donald Trump’s border wall through the region, and the development of three new liquefied natural gas facilities at the deepwater Port of Brownsville just 5 miles from SpaceX. More hangars and buildings have been built 1.5 miles from the test launch pad, where SpaceX has its administrative offices, and more and more cars line the sand dunes across from the complex, and unless one travels often the 21-mile stretch of Highway 4, also known as Boca Chica Highway, that leads to the beach and passes by the SpaceX complex, it might not be so noticeable.',
 'However, the regulatory agency that is was supposed to be guiding this environmental impact process, the FAA, allowed SpaceX.”Five years after the FAA issued its F

# Convert sentences from spacy to strings and join all sentences.

In [49]:
                                                                                #Tokenize the sentences
sentences = [sent.text for sent in doc.sents]

                                                                                #Join all sentences into a single string
joined_sentences = ' '.join(sentences)

                                                                                #Print the joined sentences
print(joined_sentences)


Cape Canaveral this is not. But here, down toward the coast, on a spit of land past the Border Patrol checkpoint, where the Rio Grande meets the Gulf of Mexico, there is a spaceship being assembled off State Highway 4 just before it dead-ends into the sea. Towering and stainless-steel shiny, it looks like a surreal sculpture amid the cactuses, yucca and relentless South Texas sun. And, because it’s being built not in a factory but out in the open, it’s become a roadside attraction, drawing gawkers to an area so remote that the county trucks in drinking water once a month to the few who live nearby. They’re coming to see Elon Musk’s latest creation, a prototype called Starship that he hopes will one day carry people by the dozens to the moon and Mars. Musk, in a presentation here Saturday, said his goal of building a “rapidly reusable spacecraft” here would lead to the fulfillment of his ultimate goal of creating “a city on Mars.” But first, he’ll need to pull off another improbable fea

# Summary for the converted spacy outputs to strings.

In [50]:
summary=' '.join(summarized_sentences)                                          #We have already tokenized and converted spacy sentences to strings
                                                                                #and generated sentence score above so using the above code I am just summarizing the output in sentences
summary

'The metamorphosis of this facility, which sits on tender tidal flats and feet from sand dunes where sea turtles lay eggs, has largely gone unnoticed and under the radar in this border community where environmentalists say they are strapped between fighting the construction of Donald Trump’s border wall through the region, and the development of three new liquefied natural gas facilities at the deepwater Port of Brownsville just 5 miles from SpaceX. More hangars and buildings have been built 1.5 miles from the test launch pad, where SpaceX has its administrative offices, and more and more cars line the sand dunes across from the complex, and unless one travels often the 21-mile stretch of Highway 4, also known as Boca Chica Highway, that leads to the beach and passes by the SpaceX complex, it might not be so noticeable. However, the regulatory agency that is was supposed to be guiding this environmental impact process, the FAA, allowed SpaceX.”Five years after the FAA issued its Final 

# Determine the Length of summary

In [51]:
                                                                                #Determine the length of the summary
summary_length_characters = len(summary)                                        # Length in characters
summary_length_words = len(summary.split())                                     # Length in words

                                                                                # Print the lengths
print("Summary length in characters:", summary_length_characters)
print("Summary length in words:", summary_length_words)

Summary length in characters: 2538
Summary length in words: 405


# Determining the length of original text.

In [52]:
                                                                                # Get the original text from the spacy doc
original_text = doc.text
                                                                                # Determine the length of the original text
original_length_characters = len(original_text)                                 # Length in characters
original_length_words = len(original_text.split())                              # Length in words

                                                                                # Print the lengths
print("Original text length in characters:", original_length_characters)
print("Original text length in words:", original_length_words)


Original text length in characters: 121215
Original text length in words: 20204


# Summarize the text using Spacy and TextRank

In [53]:
!pip install pytextrank




In [54]:
import spacy
import pytextrank
                                                                                # Load the spaCy model and add the TextRank pipeline component
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')
                                                                                # Load and process the text file
file_path = '/content/drive/MyDrive/NLP/WEEK5/spaceX_DP.txt'
with open(file_path, 'r') as file:
    text = file.read()
                                                                                # Process the text using the spaCy model with TextRank
doc = nlp(text)
                                                                                # Extract the top 5 ranked sentences for the summary
summary_sentences = []
for sent in doc._.textrank.summary(limit_phrases=5, limit_sentences=5):
    summary_sentences.append(sent.text)
                                                                                # Print the summary in 5 sentences
summary = ' '.join(summary_sentences)
print("Summary in 5 sentences:\n", summary)

Summary in 5 sentences:
 But first, he’ll need to pull off another improbable feat, building a private, commercial spaceport here, in what the top local elected official called a “mind-boggling” juxtaposition: SpaceX, one of the hottest companies in the world, led by a Silicon Valley celebrity with nearly 29 million Twitter followers, building a rocket in a border town where nearly a third of the residents live below the poverty line. Five years ago, SpaceX started building a launchpad here, hauling in dirt by the ton, that would allow the company a measure of freedom without the restraints that come with shooting rockets off from government sites, such as Cape Canaveral or Vandenberg Air Force Base in California, where several other companies operate. A handful of residents who live next door to SpaceX’s facilities recently received letters from SpaceX, which said the company’s footprint in the area was going to be bigger and more disruptive than originally imagined. The letter from S

# Question 2: Sentimental Analysis

# Installing and importing the required libraries for Sentimental Analysis.

In [55]:
!pip install vaderSentiment



In [56]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

                                                                                # Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

                                                                                # Load and process the text file
file_path = '/content/drive/MyDrive/NLP/WEEK5/SpaceX.txt'

with open(file_path, 'r') as file:
    text = file.read()

                                                                                # Perform sentiment analysis
sentiment_scores = analyzer.polarity_scores(text)

                                                                                # Extract individual sentiment metrics
compound = sentiment_scores['compound']
negative = sentiment_scores['neg']
neutral = sentiment_scores['neu']
positive = sentiment_scores['pos']

                                                                                # Determine subjectivity based on neutral score
subjectivity = "subjective" if neutral < 0.5 else "objective"

                                                                                # Print sentiment metrics and subjectivity
print(f"Compound Score: {compound}")
print(f"Negative Score: {negative}")
print(f"Neutral Score: {neutral}")
print(f"Positive Score: {positive}")
print(f"The text is more {subjectivity}.")


Compound Score: 0.5999
Negative Score: 0.061
Neutral Score: 0.863
Positive Score: 0.076
The text is more objective.
