### Part 3)

Make an interactive notebook.

In addition to presenting the project slides, at the end of the presentation each student will demonstrate their code on a famous person who is suggested by the other students and who exists in the DBpedia dataset.


In [2]:
!pip install ipywidgets
import ipywidgets as widgets
from IPython.display import display
import wikipedia
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors 
from sklearn.decomposition import TruncatedSVD 
import nltk 
nltk.download('punkt') 
nltk.download('averaged_perceptron_tagger') 
nltk.download('wordnet') 
nltk.download('stopwords')
from textblob import TextBlob, Word

def get_wikipedia_content(person_name):
    """Fetch the plain-text content of a Wikipedia page.

    Args:
        person_name: Title (or search phrase) passed to ``wikipedia.page``.

    Returns:
        The page's ``content`` string, or ``None`` when the name is blank,
        no page is found, or the title is ambiguous.
    """
    # A blank name (e.g. an empty line typed into the neighbors box) can never
    # resolve to a page, so skip the network round-trip entirely.
    if not person_name or not person_name.strip():
        return None

    try:
        page = wikipedia.page(person_name.strip())
        return page.content
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError):
        # Ambiguous titles raise DisambiguationError rather than PageError.
        # Treat both as "no usable page" so callers can skip the name instead
        # of crashing the widget callback.
        return None

def clean_text(text):
    """Normalize text for vectorization.

    Lower-cases the input, splits it on whitespace, singularizes each token
    with TextBlob's ``Word.singularize`` and joins the tokens back together
    with single spaces.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text as a single space-separated string.
    """
    tokens = text.lower().split()
    return ' '.join(Word(token).singularize() for token in tokens)

def analyze_wikipedia_content(person_name, nearest_neighbors):
    """Rank neighbors by similarity of their Wikipedia text to a person's page.

    The person's page and every retrievable neighbor page are cleaned with
    ``clean_text`` and vectorized twice (bag-of-words counts and TF-IDF).
    Neighbors are then ordered by descending cosine similarity to the
    person's page under each representation.

    Args:
        person_name: Name of the person whose page is the reference document.
        nearest_neighbors: Sequence of neighbor names to compare against.

    Returns:
        A 4-tuple ``(main_content, bow_ranking, tfidf_ranking, tfidf_matrix)``
        where the rankings are lists of neighbor names (most similar first)
        containing only the neighbors whose page could be retrieved, and
        ``tfidf_matrix`` has the person's page in row 0 followed by the
        retrieved neighbors. Returns ``(None, None, None, None)`` when the
        person's page is missing or fewer than two neighbor pages were found.
    """
    main_content = get_wikipedia_content(person_name)

    if main_content is None:
        print(f"No Wikipedia page found for {person_name}")
        return None, None, None, None

    main_content = clean_text(main_content)

    # Keep names and texts in lock-step. Matrix row i+1 must map back to the
    # neighbor that actually produced it, not to position i of the original
    # list, which still contains the neighbors that had no page.
    found_neighbors = []
    neighbor_contents = []
    for neighbor in nearest_neighbors:
        content = get_wikipedia_content(neighbor)
        if content is not None:
            found_neighbors.append(neighbor)
            neighbor_contents.append(clean_text(content))

    if len(neighbor_contents) < 2:
        print("Not enough data to perform analysis. Skipping ranking.")
        return None, None, None, None

    documents = [main_content] + neighbor_contents

    bow_vectorizer = CountVectorizer(stop_words='english')
    bow_matrix = bow_vectorizer.fit_transform(documents)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Row 0 is the reference page; the remaining rows are the neighbors.
    bow_similarity = cosine_similarity(bow_matrix[0:1], bow_matrix[1:])
    tfidf_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # argsort is ascending, so reverse it to get most-similar-first.
    bow_ranking = np.argsort(bow_similarity[0])[::-1]
    tfidf_ranking = np.argsort(tfidf_similarity[0])[::-1]

    bow_wikipedia_ranking = [found_neighbors[i] for i in bow_ranking]
    tfidf_wikipedia_ranking = [found_neighbors[i] for i in tfidf_ranking]

    return main_content, bow_wikipedia_ranking, tfidf_wikipedia_ranking, tfidf_matrix

# Single-line input for the person whose Wikipedia page is the reference.
person_name_widget = widgets.Text(
    value="Albert Einstein",
    placeholder="Enter person name",
    description="Person:",
    disabled=False,
)

# Multi-line input holding the neighbors to rank, one name per line.
nearest_neighbors_widget = widgets.Textarea(
    value="\n".join([
        "Marie Curie",
        "Isaac Newton",
        "Galileo Galilei",
        "Stephen Hawking",
        "Richard Feynman",
        "Nikola Tesla",
        "Charles Darwin",
        "Aristotle",
        "Archimedes",
        "Leonardo da Vinci",
    ]),
    placeholder="Enter nearest neighbors (one per line)",
    description="Neighbors:",
    disabled=False,
)

def process_input(b):
    """Button callback: run the analysis and write a report to ``output_widget``.

    Reads the person name and the neighbor list (one name per line) from the
    input widgets, calls ``analyze_wikipedia_content`` and renders the
    sentiment of the person's page, both similarity rankings, and a
    comparison of each neighbor's original position with its TF-IDF rank.

    Args:
        b: The button instance passed by ``Button.on_click`` (unused).
    """
    person_name = person_name_widget.value.strip()
    # Drop blank lines and stray whitespace so a trailing newline or padded
    # entry is not treated as a (non-existent) neighbor name.
    nearest_neighbors = [
        line.strip()
        for line in nearest_neighbors_widget.value.splitlines()
        if line.strip()
    ]

    main_content, bow_wikipedia_ranking, tfidf_wikipedia_ranking, tfidf_matrix = analyze_wikipedia_content(person_name, nearest_neighbors)

    if main_content is None:
        # The analysis returns None both when the person's page is missing and
        # when fewer than two neighbor pages were found, so name both causes.
        output_widget.value = (
            f"Unable to perform analysis for {person_name}: no Wikipedia page "
            "was found, or fewer than two neighbor pages could be retrieved."
        )
        return

    # Compute the sentiment once; it scans the whole page text.
    sentiment = TextBlob(main_content).sentiment

    parts = [
        f"Sentiment of {person_name}'s Wikipedia page:\n",
        f"Polarity: {sentiment.polarity}\n",
        f"Subjectivity: {sentiment.subjectivity}\n\n",
    ]

    parts.append("BoW Ranking:\n")
    for position, neighbor in enumerate(bow_wikipedia_ranking, start=1):
        parts.append(f"{position}. {neighbor}\n")

    parts.append("\nTF-IDF Ranking:\n")
    for position, neighbor in enumerate(tfidf_wikipedia_ranking, start=1):
        parts.append(f"{position}. {neighbor}\n")

    # First-occurrence rank lookup, matching what list.index() would return.
    tfidf_rank = {}
    for position, neighbor in enumerate(tfidf_wikipedia_ranking, start=1):
        tfidf_rank.setdefault(neighbor, position)

    parts.append("\nComparison of rankings:\n")
    for original_rank, neighbor in enumerate(nearest_neighbors, start=1):
        if neighbor in tfidf_rank:
            parts.append(f"{neighbor}: Original rank {original_rank}, Wikipedia rank {tfidf_rank[neighbor]}\n")
        else:
            parts.append(f"{neighbor}: Original rank {original_rank}, Not found in Wikipedia ranking\n")

    output_widget.value = "".join(parts)

# Read-only text area that process_input fills with the report.
output_widget = widgets.Textarea(
    value="",
    placeholder="Results will appear here",
    description="Results:",
    disabled=True,
)

# Clicking the button runs the full analysis.
analyze_button = widgets.Button(description="Analyze")
analyze_button.on_click(process_input)

# Render the controls top to bottom: inputs, trigger, results.
display(
    person_name_widget,
    nearest_neighbors_widget,
    analyze_button,
    output_widget,
)


[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text(value='Albert Einstein', description='Person:', placeholder='Enter person name')

Textarea(value='Marie Curie\nIsaac Newton\nGalileo Galilei\nStephen Hawking\nRichard Feynman\nNikola Tesla\nCh…

Button(description='Analyze', style=ButtonStyle())

Textarea(value='', description='Results:', disabled=True, placeholder='Results will appear here')