In [2]:
#Kmeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
documents = [
"AI and machine learning are advancing rapidly.",
"The stock market is experiencing a surge in technology stocks.",
"Natural Language Processing (NLP) is a subset of AI.",
"Investors are focusing on AI startups.",
"Climate change affects weather patterns globally.",
"Renewable energy sources like solar and wind are vital."
]
# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
# Apply K-Means
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)
# Display clusters
for i, label in enumerate(kmeans.labels_):
  print(f"Document {i + 1}: Cluster {label}")

Document 1: Cluster 0
Document 2: Cluster 0
Document 3: Cluster 0
Document 4: Cluster 0
Document 5: Cluster 0
Document 6: Cluster 1


In [4]:
#Named Entity Recognition (NER) System
import spacy
text = """
Elon Musk is the CEO of SpaceX and Tesla. He was born in South Africa and is now based in th
e United States.
"""
# Load spaCy model
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# Extract named entities
print("Named Entities:")
for entity in doc.ents:
  print(f"{entity.text} - {entity.label_}")

Named Entities:
Elon Musk - PERSON
SpaceX - NORP
Tesla - ORG
South Africa - GPE
United States - GPE


In [5]:
#Web Scraping and Content Extraction
import requests
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/Web_scraping"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract title and main content
title = soup.find("h1").text
content = " ".join([p.text for p in soup.find_all("p")[:3]]) # First 3 paragraphs
print("Page Title:", title)
print("\nExtracted Content:")
print(content)

Page Title: Web scraping

Extracted Content:
Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites.[1] Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis.
 Scraping a web page involves fetching it and then extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Having fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadshe

In [6]:
#Text Similarity Checker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
doc1 = "Artificial intelligence is transforming industries."
doc2 = "AI is revolutionizing the industrial world."
# Calculate similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print("Similarity Score:", similarity[0][0])

Similarity Score: 0.10163066979112656


In [7]:
#Topic Modeling with LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
documents = [
"AI is a growing field of technology.",
"Machine learning is part of AI.",
"Climate change impacts the environment.",
"Renewable energy sources are vital for sustainability."
]
# Convert text to count vectors
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
# Apply LDA
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X)
# Display topics
print("Topics:")
for i, topic in enumerate(lda.components_):
  words = [vectorizer.get_feature_names_out()[j] for j in topic.argsort()[-5:]]
  print(f"Topic {i + 1}: {', '.join(words)}")

Topics:
Topic 1: change, climate, impacts, environment, ai
Topic 2: sustainability, sources, energy, renewable, vital


In [8]:
#Image Caption Retrieval
import cv2
from difflib import get_close_matches
captions = {
"dog.jpg": "A dog sitting on the grass.",
"cat.jpg": "A cat lying on a couch.",
"car.jpg": "A car parked on the street."
}
image_to_search = "dog.jpg"
matches = get_close_matches(image_to_search, captions.keys(), n=1, cutoff=0.5)
caption = captions[matches[0]] if matches else "No matching caption found."
print("Image Caption:")
print(caption)

Image Caption:
A dog sitting on the grass.


In [10]:
#Implementation of PageRank Algorithm
import networkx as nx
# Create a directed graph
graph = nx.DiGraph()
graph.add_edges_from([(1, 2), (2, 3), (3, 1), (3, 4), (4, 2)])
# Calculate PageRank
pagerank = nx.pagerank(graph)
print("PageRank Scores:")
for node, score in pagerank.items():
  print(f"Node {node}: {score:.4f}")

PageRank Scores:
Node 1: 0.1736
Node 2: 0.3326
Node 3: 0.3202
Node 4: 0.1736


In [15]:
!pip install --upgrade snscrape





In [16]:
#Social Media Data Retrieval and Analysis
import snscrape.modules.twitter as sntwitter

# Function to scrape tweets based on a keyword
def scrape_tweets(keyword, count=10):
    try:
        # Create a generator for tweets containing the keyword
        tweets = sntwitter.TwitterSearchScraper(f'{keyword}').get_items()

        # Collect a specified number of tweets
        tweet_list = []
        for i, tweet in enumerate(tweets):
            if i >= count:
                break
            tweet_list.append({
                "username": tweet.user.username,
                "date": tweet.date,
                "content": tweet.content,
                "likes": tweet.likeCount,
                "retweets": tweet.retweetCount
            })

        # Display the tweets
        for idx, tweet in enumerate(tweet_list, start=1):
            print(f"Tweet {idx}:")
            print(f"User: {tweet['username']}")
            print(f"Date: {tweet['date']}")
            print(f"Content: {tweet['content']}")
            print(f"Likes: {tweet['likes']}, Retweets: {tweet['retweets']}")
            print("-" * 50)

    except Exception as e:
        print(f"Error: {e}")

# Input keyword and scrape tweets
if __name__ == "__main__":
    keyword = input("Enter a keyword to search for: ")
    scrape_tweets(keyword, count=5)


Enter a keyword to search for: David Miller


ERROR:snscrape.base:Error retrieving https://twitter.com/search?f=live&lang=en&q=David+Miller&src=spelling_expansion_revert_click: SSLError(MaxRetryError("HTTPSConnectionPool(host='twitter.com', port=443): Max retries exceeded with url: /search?f=live&lang=en&q=David+Miller&src=spelling_expansion_revert_click (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))"))
CRITICAL:snscrape.base:4 requests to https://twitter.com/search?f=live&lang=en&q=David+Miller&src=spelling_expansion_revert_click failed, giving up.
CRITICAL:snscrape.base:Errors: SSLError(MaxRetryError("HTTPSConnectionPool(host='twitter.com', port=443): Max retries exceeded with url: /search?f=live&lang=en&q=David+Miller&src=spelling_expansion_revert_click (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007

Error: 4 requests to https://twitter.com/search?f=live&lang=en&q=David+Miller&src=spelling_expansion_revert_click failed, giving up.


In [19]:
#Audio Transcription and Keyword Search
!pip install SpeechRecognition
import speech_recognition as sr
import re
# Initialize recognizer
recognizer = sr.Recognizer()
# Transcribe audio file
audio_file = "sample.wav" # Replace with actual file
with sr.AudioFile(audio_file) as source:
  audio = recognizer.record(source)
  text = recognizer.recognize_google(audio)
print("Transcribed Text:")
print(text)
# Search for keywords
keyword = "AI"
found = re.search(keyword, text, re.IGNORECASE)
print("Keyword Found" if found else "Keyword Not Found")

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.11.0


FileNotFoundError: [Errno 2] No such file or directory: 'sample.wav'

In [24]:
#Question-Answering System Using Knowledge Base
from difflib import get_close_matches
knowledge_base = {
"What is AI?": "AI stands for Artificial Intelligence, the simulation of human intelligence in machines.",
"What is Machine Learning?": "Machine Learning is a subset of AI focused on building models that learn from data.",
"What is NLP?": "NLP, or Natural Language Processing, deals with the interaction betweencomputers and human languages."
}
def answer_question(question):
  matches = get_close_matches(question, knowledge_base.keys(), n=1, cutoff=0.5)
  return knowledge_base[matches[0]] if matches else "I don't have an answer for that."
user_question = input("Ask a question: ")
response = answer_question(user_question)
print("Answer:")
print(response)

Ask a question: What is AI?
Answer:
AI stands for Artificial Intelligence, the simulation of human intelligence in machines.
