### NLP processing of a bunch of text for the Streamlit application

In [None]:
import nltk
import stanza
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ne_chunk
from rake_nltk import Rake
from wordcloud import WordCloud

In [10]:
# One-time setup: NLTK resources used by this notebook.
# Uncomment and run once on a machine where they are not installed yet.
#   punkt                     - presumably the tokenizer models behind word_tokenize
#   stopwords                 - corpus read by stopwords.words('english')
#   wordnet                   - presumably the data behind WordNetLemmatizer
#   maxent_ne_chunker, words  - presumably required by ne_chunk (imported above)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

In [11]:
# Initialize the Stanza pipeline used for POS tagging in Step 2.
# This assignment must stay live: the preprocessing cell calls
# `nlp_stanza(...)`, so with the line commented out a fresh
# Restart Kernel -> Run All fails with NameError.
# One-time setup, if the English models are not installed yet:
# stanza.download('en')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos')

In [12]:

# Step 1: Load the source document from disk
file_path = "C:/Sudhakar/Projects/Guvi Final Project/NLP/Dataset and Document/Scar lion king.txt"

# Read the whole file into a single UTF-8 decoded string.
with open(file_path, mode="r", encoding="utf-8") as source_file:
    text = source_file.read()

# Show only the beginning of the document as a sanity check.
preview = text[:500]
print("Original Text (First 500 characters):\n", preview)

Original Text (First 500 characters):
 Scarface, a legendary alpha lion of the Maasai Mara National Reserve in Kenya, was known for his resilience, dominance, and survival skills, earning him fame among tourists and wildlife enthusiasts. He died of natural causes on June 11, 2021, at the age of 14, after a life marked by territorial battles and encounters with other predators. 
Key aspects of Scarface's story:
Dominance and Resilience:
Scarface was a dominant male lion, known for his ability to survive and thrive despite facing numer


In [13]:
# Step 2: NLP preprocessing pipeline
# Each stage prints a short preview so intermediate results can be inspected.

# Tokenization: split the raw document into tokens.
tokens = word_tokenize(text)
print("\nTokenization Output:", tokens[:10])

# Lowercasing: normalize every token to lower case.
tokens_lower = list(map(str.lower, tokens))
print("\nLowercased Tokens:", tokens_lower[:10])

# Stopword removal: keep purely alphabetic tokens that are not English
# stopwords (the isalpha() check also discards punctuation and numbers).
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [
    tok
    for tok in tokens_lower
    if tok.isalpha() and tok not in stop_words
]
print("\nStopword Removal Output:", tokens_no_stopwords[:10])

# Stemming: reduce each remaining token with the Porter stemmer.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens_no_stopwords))
print("\nStemming Output:", stemmed_tokens[:10])

# Lemmatization: map each remaining token through the WordNet lemmatizer
# (called without a POS argument, exactly as before).
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens_no_stopwords))
print("\nLemmatization Output:", lemmatized_tokens[:10])

# POS tagging with Stanza: the filtered tokens are re-joined into one string,
# then each word is paired with its `xpos` tag.
doc = nlp_stanza(" ".join(tokens_no_stopwords))
pos_tags = [
    (token.text, token.xpos)
    for sentence in doc.sentences
    for token in sentence.words
]
print("\nPOS Tagging Output:", pos_tags[:20])

# Keyword extraction with RAKE on the original, unfiltered text;
# only the 20 highest-ranked phrases are kept.
rake = Rake()
rake.extract_keywords_from_text(text)
keywords = rake.get_ranked_phrases()[:20]
print("\nExtracted Keywords:", keywords)




Tokenization Output: ['Scarface', ',', 'a', 'legendary', 'alpha', 'lion', 'of', 'the', 'Maasai', 'Mara']

Lowercased Tokens: ['scarface', ',', 'a', 'legendary', 'alpha', 'lion', 'of', 'the', 'maasai', 'mara']

Stopword Removal Output: ['scarface', 'legendary', 'alpha', 'lion', 'maasai', 'mara', 'national', 'reserve', 'kenya', 'known']

Stemming Output: ['scarfac', 'legendari', 'alpha', 'lion', 'maasai', 'mara', 'nation', 'reserv', 'kenya', 'known']

Lemmatization Output: ['scarface', 'legendary', 'alpha', 'lion', 'maasai', 'mara', 'national', 'reserve', 'kenya', 'known']

POS Tagging Output: [('scarface', 'NN'), ('legendary', 'JJ'), ('alpha', 'NN'), ('lion', 'NN'), ('maasai', 'NN'), ('mara', 'NNP'), ('national', 'JJ'), ('reserve', 'NN'), ('kenya', 'NNP'), ('known', 'VBN'), ('resilience', 'NN'), ('dominance', 'NN'), ('survival', 'NN'), ('skills', 'NNS'), ('earning', 'VBG'), ('fame', 'NN'), ('among', 'IN'), ('tourists', 'NNS'), ('wildlife', 'NN'), ('enthusiasts', 'NNS')]

Extracted Keyw

In [14]:

# Bundle the output of every preprocessing stage under a descriptive key so
# the results can be passed around as a single object.
processed_data = dict(
    tokens=tokens,
    tokens_lower=tokens_lower,
    tokens_no_stopwords=tokens_no_stopwords,
    stemmed_tokens=stemmed_tokens,
    lemmatized_tokens=lemmatized_tokens,
    pos_tags=pos_tags,
    keywords=keywords,
)

In [None]:
# Show a short preview (first 10 items) of each processing stage instead of
# dumping every token list in full, which floods the notebook output for
# long documents. `processed_data` itself is left untouched.
{stage: values[:10] for stage, values in processed_data.items()}

In [16]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [17]:
# Fetch the VADER lexicon that the sentiment analyzer below depends on.
nltk.download("vader_lexicon")



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vsudh\AppData\Roaming\nltk_data...


True

In [18]:
# Build the VADER analyzer and score the whole document in one call.
# `text` is the document string that was read from disk in Step 1.
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)

In [19]:


# Map the compound score onto a human-readable label:
# >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.
compound = sentiment_scores["compound"]
if compound <= -0.05:
    sentiment_label = "Negative 😠"
elif compound >= 0.05:
    sentiment_label = "Positive 😀"
else:
    sentiment_label = "Neutral 😐"


In [20]:
# Print results
print("\nSentiment Analysis Scores:", sentiment_scores)
print("Overall Sentiment:", sentiment_label)


Sentiment Analysis Scores: {'neg': 0.091, 'neu': 0.743, 'pos': 0.167, 'compound': 0.9322}
Overall Sentiment: Positive 😀
