In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import string

# Ensure the tokenizer data needed by word_tokenize is available.
# Newer NLTK releases (3.9 and later) look up the Punkt models under
# 'punkt_tab' instead of the older pickled 'punkt' package, so fetch both
# to keep this cell working across NLTK versions. nltk.download() reports a
# failure by returning False rather than raising, so an unavailable package
# does not stop the cell.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_text_file(file_path):
    """Read a UTF-8 text file and return its contents as one string.

    If the file does not exist, a notice is printed and None is returned
    instead of raising FileNotFoundError.
    """
    contents = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    return contents

def preprocess_text(text):
    """Strip every character in ``string.punctuation`` and lowercase the rest."""
    # Mapping each punctuation character to None makes translate() delete it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = text.translate(deletion_table)
    return without_punctuation.lower()

def get_most_common_words(text, n=10):
    """Return the ``n`` most frequent tokens in ``text``.

    The text is split with ``word_tokenize``, counted with ``FreqDist``, and
    the result of ``most_common(n)`` is returned (iterated by the caller as
    ``(word, count)`` pairs).
    """
    return FreqDist(word_tokenize(text)).most_common(n)

def main():
    """Print the ten most frequent words found in the configured text file."""
    # Replace with the path to your text file
    file_path = "sample.txt"

    raw_text = load_text_file(file_path)
    if not raw_text:
        # Nothing to report when the file is missing (None) or empty.
        return

    cleaned_text = preprocess_text(raw_text)
    ranking = get_most_common_words(cleaned_text, 10)

    print("The 10 most common words are:")
    for term, occurrences in ranking:
        print(f"{term}: {occurrences}")

if __name__ == "__main__":
    main()


File not found. Please check the file path.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rames\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two strings.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    weights reflect only this pair. The value returned is element ``[0][0]``
    of the matrix produced by ``cosine_similarity``.
    """
    documents = [text1, text2]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)

    # Row slices (rather than plain indexing) keep each operand two-dimensional.
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]

    return cosine_similarity(first_row, second_row)[0][0]

def main():
    """Compare two example sentences and print their cosine similarity."""
    first_sentence = "This is a sample sentence."
    second_sentence = "This sentence is a sample."

    score = calculate_cosine_similarity(first_sentence, second_sentence)
    print(f"Cosine Similarity: {score}")

if __name__ == "__main__":
    main()


Cosine Similarity: 1.0


In [None]:
import spacy

def perform_ner(text):
    """Run spaCy's ``en_core_web_sm`` pipeline on ``text`` and list its entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per item in
    the processed document's ``ents``. The model is loaded on every call.
    """
    pipeline = spacy.load("en_core_web_sm")

    entities = []
    for span in pipeline(text).ents:
        entities.append((span.text, span.label_))
    return entities

def main():
    """Run NER on an example passage and print each entity with its label."""
    sample = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii. He was the 44th President of the United States."

    found = perform_ner(sample)

    print("Named Entities and their Types:")
    for name, kind in found:
        print(f"{name}: {kind}")

if __name__ == "__main__":
    main()


In [18]:
pip install textblob


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ------

In [16]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Classify ``text`` by the sign of its TextBlob sentiment polarity.

    Returns a ``(sentiment, polarity)`` tuple where ``sentiment`` is
    "Positive" for polarity above zero, "Negative" for polarity below zero,
    and "Neutral" otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        return "Positive", polarity
    if polarity < 0:
        return "Negative", polarity
    return "Neutral", polarity

def main():
    """Analyze an example review and print its sentiment label and polarity."""
    review = "I love this product! It's amazing and works perfectly."

    label, score = analyze_sentiment(review)

    # One print call with a newline separator emits the same three lines.
    print(
        "Sentiment Analysis Result:",
        f"Sentiment: {label}",
        f"Polarity: {score}",
        sep="\n",
    )

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'textblob'

In [8]:
import string
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

# Ensure the tokenizer data needed by word_tokenize is available.
# Newer NLTK releases (3.9 and later) look up the Punkt models under
# 'punkt_tab' instead of the older pickled 'punkt' package, so fetch both
# to keep this cell working across NLTK versions. nltk.download() reports a
# failure by returning False rather than raising, so an unavailable package
# does not stop the cell.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_text_file(file_path):
    """Return the full contents of a UTF-8 text file.

    A missing file is reported with a printed message and yields None rather
    than propagating FileNotFoundError.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as reader:
            data = reader.read()
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        data = None
    return data

def preprocess_text(text):
    """Delete all ``string.punctuation`` characters, then lowercase the text."""
    # translate() removes any code point whose table entry is None.
    punctuation_table = {ord(symbol): None for symbol in string.punctuation}
    stripped = text.translate(punctuation_table)
    return stripped.lower()

def calculate_term_frequency(tokens):
    """Map each distinct token to its share of the total token count.

    The share is ``count / len(tokens)``. An empty ``tokens`` sequence yields
    an empty dict because there are no counts to divide.
    """
    n_tokens = len(tokens)
    frequencies = {}
    for term, occurrences in Counter(tokens).items():
        frequencies[term] = occurrences / n_tokens
    return frequencies

def main():
    """Print the five highest term frequencies found in the configured text file."""
    # Replace with the path to your text file
    file_path = "sample.txt"

    raw_text = load_text_file(file_path)
    if not raw_text:
        # Nothing to report when the file is missing (None) or empty.
        return

    # Clean, tokenize, then compute each token's term frequency (TF).
    word_tokens = word_tokenize(preprocess_text(raw_text))
    term_frequencies = calculate_term_frequency(word_tokens)

    # Highest frequency first; the stable sort keeps tied tokens in first-seen order.
    ranked = sorted(term_frequencies.items(), key=lambda pair: pair[1], reverse=True)

    print("Top 5 Most Frequent Tokens and Their Term Frequencies:")
    for term, share in ranked[:5]:
        print(f"{term}: {share:.4f}")

if __name__ == "__main__":
    main()

File not found. Please check the file path.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rames\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
