In [2]:
#practical 1

import re
import string
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

# Download required resources for nltk
nltk.download('stopwords')
nltk.download('punkt')

# Function to fetch and clean raw HTML from a URL
def fetch_and_clean_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove script and style elements
    for script in soup(['script', 'style']):
        script.extract()

    # Get text and remove extra whitespace
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to normalize text
def normalize_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

# Function to tokenize and remove stopwords
def tokenize_and_remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Function for feature extraction using TF-IDF
def extract_features(texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix, vectorizer.get_feature_names_out()

# Example usage
if __name__ == "__main__":
    url = 'https://en.wikipedia.org/wiki/Web_mining'  # Example web page
    raw_text = fetch_and_clean_html(url)
    normalized_text = normalize_text(raw_text)
    tokens = tokenize_and_remove_stopwords(normalized_text)
    processed_text = ' '.join(tokens)

    tfidf_matrix, feature_names = extract_features([processed_text])

    print("Top 10 Features:")
    print(feature_names[:10])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sidda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sidda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 Features:
['aahc' 'ability' 'able' 'academic' 'acceleration' 'acceptable' 'access'
 'accessibility' 'according' 'account']


In [4]:
#Pract 2
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sample user-item rating matrix (rows: users, columns: items)
data = {
    'Item1': [5, 4, np.nan, 1],
    'Item2': [3, np.nan, np.nan, 1],
    'Item3': [4, 2, 5, np.nan],
    'Item4': [np.nan, 3, 4, 2]
}

df = pd.DataFrame(data, index=['User1', 'User2', 'User3', 'User4'])

print("Original Ratings Matrix (with NaNs):")
print(df)

# Step 1: Fill missing values with user mean (mean imputation)
df_filled = df.apply(lambda row: row.fillna(row.mean()), axis=1)

# Step 2: Compute user-user cosine similarity
similarity = cosine_similarity(df_filled)
similarity_df = pd.DataFrame(similarity, index=df.index, columns=df.index)

print("\nUser-User Similarity Matrix:")
print(similarity_df.round(2))

# Step 3: Predict missing ratings using weighted average of neighbors
def predict_rating(user, item):
    if not np.isnan(df.loc[user, item]):
        return df.loc[user, item]  # return actual rating if it exists

    # Users who have rated this item
    users_who_rated = df[item].dropna().index
    sims = similarity_df.loc[user, users_who_rated]
    ratings = df.loc[users_who_rated, item]

    if sims.sum() == 0:
        return df.loc[user].mean()  # fallback: user's average

    # Weighted average
    weighted_sum = np.dot(sims, ratings)
    return weighted_sum / sims.sum()

# Step 4: Create a predicted rating matrix
predicted_df = df.copy()
for user in df.index:
    for item in df.columns:
        predicted_df.loc[user, item] = predict_rating(user, item)

print("\nPredicted Ratings Matrix (NaNs filled):")
print(predicted_df.round(2))


Original Ratings Matrix (with NaNs):
       Item1  Item2  Item3  Item4
User1    5.0    3.0    4.0    NaN
User2    4.0    NaN    2.0    3.0
User3    NaN    NaN    5.0    4.0
User4    1.0    1.0    NaN    2.0

User-User Similarity Matrix:
       User1  User2  User3  User4
User1   1.00   0.98   0.98   0.94
User2   0.98   1.00   0.96   0.91
User3   0.98   0.96   1.00   0.94
User4   0.94   0.91   0.94   1.00

Predicted Ratings Matrix (NaNs filled):
       Item1  Item2  Item3  Item4
User1   5.00   3.00   4.00   3.01
User2   4.00   2.04   2.00   3.00
User3   3.36   2.02   5.00   4.00
User4   1.00   1.00   3.68   2.00


In [5]:
#pract 3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from collections import defaultdict, Counter
import heapq

# Basic crawler
def crawl(start_url, depth=1):
    visited = set()
    queue = [(start_url, 0)]
    documents = {}

    while queue:
        url, cur_depth = queue.pop(0)
        if url in visited or cur_depth > depth:
            continue

        try:
            response = requests.get(url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text()
            documents[url] = clean_text(text)
            visited.add(url)

            if cur_depth < depth:
                for link in soup.find_all('a', href=True):
                    abs_url = urljoin(url, link['href'])
                    if urlparse(abs_url).scheme in ['http', 'https']:
                        queue.append((abs_url, cur_depth + 1))

        except Exception as e:
            print(f"Failed to crawl {url}: {e}")
            continue

    return documents

# Clean and tokenize text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().split()

# Build inverted index
def build_inverted_index(documents):
    index = defaultdict(list)
    for url, words in documents.items():
        word_counts = Counter(words)
        for word, count in word_counts.items():
            index[word].append((url, count))
    return index

# Search function (ranking by frequency)
def search(query, index):
    query = query.lower().split()
    scores = defaultdict(int)

    for word in query:
        for url, freq in index.get(word, []):
            scores[url] += freq

    ranked = heapq.nlargest(5, scores.items(), key=lambda x: x[1])
    return ranked

# Example usage
if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Web_search_engine"
    print("🔍 Crawling the web...")
    docs = crawl(start_url, depth=1)

    print("\n⚙️ Building index...")
    index = build_inverted_index(docs)

    while True:
        query = input("\nEnter search query (or type 'exit'): ")
        if query.lower() == 'exit':
            break
        results = search(query, index)
        if results:
            print("\nTop results:")
            for url, score in results:
                print(f"{url} (score: {score})")
        else:
            print("No results found.")


🔍 Crawling the web...



KeyboardInterrupt



In [6]:
#pract 4
import networkx as nx

def pagerank(graph, damping=0.85, max_iter=100, tol=1.0e-6):
    if len(graph) == 0:
        return {}

    nodes = list(graph.keys())
    N = len(nodes)
    ranks = {node: 1.0 / N for node in nodes}

    # Build adjacency list
    outlinks = {node: set(graph[node]) for node in nodes}
    for node in nodes:
        outlinks[node] = set(outlinks[node]) & set(nodes)

    for iteration in range(max_iter):
        new_ranks = {}
        for node in nodes:
            rank_sum = 0.0
            for src in nodes:
                if node in outlinks[src]:
                    rank_sum += ranks[src] / len(outlinks[src]) if len(outlinks[src]) > 0 else 0
                elif len(outlinks[src]) == 0:
                    rank_sum += ranks[src] / N  # Dangling node contribution

            new_ranks[node] = (1 - damping) / N + damping * rank_sum

        # Check for convergence
        diff = sum(abs(new_ranks[node] - ranks[node]) for node in nodes)
        ranks = new_ranks
        if diff < tol:
            break

    return ranks

# Example Graphs
example_graph = {
    'A': ['B'],
    'B': ['C'],
    'C': ['A'],
    'D': []  # Dangling node
}

empty_graph = {}

# Run PageRank
print("📊 PageRank for Example Graph (with circular & dangling nodes):")
ranks = pagerank(example_graph)
for node, score in ranks.items():
    print(f"{node}: {score:.4f}")

print("\n📊 PageRank for Empty Graph:")
print(pagerank(empty_graph))



📊 PageRank for Example Graph (with circular & dangling nodes):
A: 0.3175
B: 0.3175
C: 0.3175
D: 0.0476

📊 PageRank for Empty Graph:
{}
