In [146]:
import requests
from bs4 import BeautifulSoup
import wikipedia

import pandas as pd
from IPython.display import display, clear_output

In [147]:
def convert_encoding(element):
    """Repair a string whose UTF-8 bytes were mistakenly decoded as ISO-8859-1.

    Parameters
    ----------
    element : Any
        A cell value. Only ``str`` values are processed; anything else is
        returned unchanged.

    Returns
    -------
    Any
        The re-decoded string when the Latin-1 -> UTF-8 round trip succeeds,
        otherwise the original value.
    """
    if not isinstance(element, str):
        return element
    try:
        return element.encode('ISO-8859-1').decode('UTF-8')
    except UnicodeError:
        # UnicodeEncodeError: the text holds characters outside Latin-1, so it
        # cannot be the product of a Latin-1 mis-decode.
        # UnicodeDecodeError: the Latin-1 bytes are not valid UTF-8, i.e. the
        # text was already decoded correctly.
        # Either way there is nothing to repair, so keep the value as it is.
        return element

def scrape_data(url, timeout=30):
    """Download ``url`` and return the cell text of its first ``wikitable``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list[list[str]]
        One inner list per ``<tr>`` of the table, holding the stripped text of
        that row's ``<td>`` cells. Rows without any ``<td>`` (for example rows
        made only of ``<th>`` header cells) yield an empty list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<table class="wikitable">``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    if table is None:
        raise ValueError(f"No table with class 'wikitable' found at {url}")

    return [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]


def clean_dataframe(df):
    """Display a cleaned preview of ``nobel.csv`` and return the gap-filled table.

    Parameters
    ----------
    df : Any
        Accepted only for backward compatibility. It is not used: the data
        always comes from ``nobel.csv`` in the working directory.
        NOTE(review): decide whether the scraped frame or the CSV should be
        the real input, then drop the other.

    Returns
    -------
    pandas.DataFrame
        The contents of ``nobel.csv`` with all-empty rows removed and the
        remaining missing values replaced by ``'No award'``. The text
        stripping and date conversion shown in the preview are NOT applied
        to this frame, so ``year`` keeps the dtype it has in the file.

    Side effects
    ------------
    * Displays the cleaned preview.
    * Overwrites ``nobel.csv`` with the returned frame.
    """
    csv_path = 'nobel.csv'

    # Read the file once, as UTF-8. Reading as ISO-8859-1 and repairing each
    # cell afterwards is fragile: stripping whitespace from the mis-decoded
    # text can cut a multi-byte character in half (its last byte may look like
    # a Latin-1 space), which then makes the repair step fail.
    raw = pd.read_csv(csv_path, encoding='UTF-8')

    def _strip_text(value):
        # Only strings carry surrounding whitespace; leave numbers/NaN alone.
        return value.strip() if isinstance(value, str) else value

    # Preview: trimmed text, year parsed as a date, incomplete rows removed.
    # Series.map is used per column because DataFrame.applymap is deprecated.
    preview = raw.apply(lambda column: column.map(_strip_text))
    preview['year'] = pd.to_datetime(preview['year'], format='%Y', errors='coerce')
    # A single dropna covers both originally-missing cells and years that
    # could not be parsed (coerced to NaT above).
    preview = preview.dropna(how='any')
    display(preview)

    # Result: keep partially filled rows and label the gaps explicitly.
    result = raw.dropna(how='all').fillna('No award')
    result.to_csv(csv_path, index=False)

    return result









In [148]:
# Scrape the laureate table from Wikipedia into a frame, then hand that frame
# to clean_dataframe. clean_dataframe loads nobel.csv itself, so the `df`
# left behind by this cell reflects the CSV rather than the scraped rows.
url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates'
scraped_rows = scrape_data(url)
df = pd.DataFrame(
    scraped_rows,
    columns=['Year', 'Physics', 'Chemistry', 'Medicine', 'Literature', 'Peace', 'Economics'],
)
df = clean_dataframe(df)

Unnamed: 0,year,physics,chemistry,medicine,literature,peace,economics
0,1901-01-01,Wilhelm Röntgen,Jacobus Henricus van 't Hoff,Emil von Behring,Sully Prudhomme,Henry Dunant;Frédéric Passy,—
1,1902-01-01,Hendrik Lorentz;Pieter Zeeman,Emil Fischer,Ronald Ross,Theodor Mommsen,Élie Ducommun;Charles Albert Gobat,No award this year
2,1903-01-01,Henri Becquerel;Pierre Curie;Marie Curie,Svante Arrhenius,Niels Ryberg Finsen,Bjørnstjerne Bjørnson,Randal Cremer,No award this year
3,1904-01-01,Lord Rayleigh,William Ramsay,Ivan Pavlov,Frédéric Mistral;José Echegaray,Institut de Droit International,No award this year
4,1905-01-01,Philipp Lenard,Adolf von Baeyer,Robert Koch,Henryk Sienkiewicz,Bertha von Suttner,No award this year
5,1906-01-01,J. J. Thomson,Henri Moissan,Camillo Golgi;Santiago Ramón y Cajal,Giosuè Carducci,Theodore Roosevelt,No award this year
6,1907-01-01,Albert A. Michelson,Eduard Buchner,Charles Louis Alphonse Laveran,Rudyard Kipling,Ernesto Teodoro Moneta;Louis Renault,No award this year
7,1908-01-01,Gabriel Lippmann,Ernest Rutherford,Élie Metchnikoff;Paul Ehrlich,Rudolf Christoph Eucken,Klas Pontus Arnoldson;Fredrik Bajer,No award this year
8,1909-01-01,Karl Ferdinand Braun;Guglielmo Marconi,Wilhelm Ostwald,Emil Theodor Kocher,Selma Lagerlöf,Auguste Beernaert;Paul Henri Balluet d'Estourn...,No award this year
9,1910-01-01,Johannes Diderik van der Waals,Otto Wallach,Albrecht Kossel,Paul Heyse,International Peace Bureau,No award this year


In [161]:
def look_year_and_category(year, category):
    """Return the laureates recorded for ``year`` in the ``category`` column.

    Reads the notebook-level frame ``df``. Only the first row matching
    ``year`` is used; its cell is split on ``';'``.

    Parameters
    ----------
    year : Any
        Value compared for equality against ``df['year']``.
    category : str
        Name of the column to read (for example ``'physics'``).

    Returns
    -------
    list[str]
        The ``';'``-separated names, or an empty list when no row matches the
        year or the matching cell is not text.
    """
    matches = df.loc[df['year'] == year, category]
    if matches.empty:
        # Previously this raised IndexError on an unknown year.
        return []

    first_cell = matches.iloc[0]
    if not isinstance(first_cell, str):
        return []
    return first_cell.split(';')

def find_column_with_name(name):
    """Return the columns of the notebook-level ``df`` whose cells contain ``name``.

    Every cell is converted with ``str()`` first, so non-text columns are
    searched as well. ``name`` is matched as a literal, case-sensitive
    substring.

    Parameters
    ----------
    name : str
        Text to look for.

    Returns
    -------
    list
        Labels of the columns with at least one matching cell, in column order.
    """
    # regex=False: names routinely contain regex metacharacters ("J. J.",
    # parentheses, ...) that would otherwise over-match or raise re.error.
    # Series.map(str) replaces the deprecated DataFrame.applymap(str).
    mask = df.apply(
        lambda col: col.map(str).str.contains(name, regex=False, na=False)
    )

    columns_with_name = df.columns[mask.any(axis=0)]
    return list(columns_with_name)


In [150]:

# Download the English Wikipedia article "List of Nobel laureates by country",
# split it into pieces for the cells below, and save the raw text to disk.
wikipedia.set_lang("en")

try:
    # The lookup happens only here, inside the try, so the handlers below
    # actually get a chance to report a failure.
    page = wikipedia.page("List of Nobel laureates by country")
    content = page.content

    # Later cells treat the odd-indexed pieces of this split as country
    # headings and the piece after each heading as that country's text.
    contentList = content.split("===")

    filename = "country.txt"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"Content saved to {filename}")

except wikipedia.exceptions.PageError:
    print("Page not found")
except wikipedia.exceptions.DisambiguationError as e:
    print("Disambiguation error. Possible options include:")
    print(e.options)


Content saved to country.txt


In [151]:
# Module-level country name. The functions in the visible cells take their own
# `country` parameter, so this value is not read by them.
country = "United States"
# Second name for the SAME list object as contentList (no copy is made), so
# in-place edits made through either name are visible through both.
contentListCountry = contentList

def set_countries_to_list():
    """Trim surrounding whitespace from every odd-indexed entry of ``contentListCountry``.

    The notebook-level list is edited in place; even-indexed entries are left
    untouched. The same list object is returned.
    """
    for position in range(1, len(contentListCountry), 2):
        contentListCountry[position] = contentListCountry[position].strip()
    return contentListCountry

# Trim the odd-indexed entries in place. As the last expression of the cell,
# the returned list is also echoed in full as the cell output.
set_countries_to_list()

['This is a list of Nobel Prize laureates by country. Listings for Economics refer to the related Nobel Memorial Prize in Economic Sciences. The Nobel Prizes and the Prize in Economic Sciences have been awarded 567 times to 889 recipients, of which 26 awards (all Peace Prizes) were to organizations. Due to some recipients receiving multiple awards, the total number of recipients is 860 individuals and 22 organizations.The present list ranks laureates under the country/countries that are stated by the Nobel Prize committee on its website. The list does not distinguish between laureates who received a full prize and the majority who shared a prize. Some laureates are listed under more than one country, because the official website mentions multiple countries in relation to the laureate. If a country is merely mentioned as the place of birth, an asterisk (*) is used in the respective listing to indicate this. In this case, the birth country is mentioned in italics at the other listings of

In [152]:
def give_me_people_by_country(country):
    """Return the text stored right after ``country`` in ``contentListCountry``.

    Parameters
    ----------
    country : str
        Entry to look for; it must equal a list element exactly.

    Returns
    -------
    str or None
        The list element that follows the first match, or ``None`` (after
        printing a message) when ``country`` is absent or is the last element.
    """
    try:
        # One scan instead of a membership test followed by index().
        found_index = contentListCountry.index(country)
    except ValueError:
        print("Country not found")
        return None

    body_index = found_index + 1
    if body_index >= len(contentListCountry):
        # The match is the final entry, so there is nothing after it to return.
        print(f"No section text found for {country}")
        return None

    # Read from the same list that was searched, so the position and the text
    # can never come from two different lists.
    return contentListCountry[body_index]

In [179]:
# Text stored after the "Canada" entry; None if the lookup does not find it.
lista_de_personas = give_me_people_by_country("Canada")


In [154]:
def filterable(data_string):
    """Parse lines of the form ``Name, Prize, Year`` into a DataFrame.

    Parameters
    ----------
    data_string : str or None
        Newline-separated text. Leading/trailing ``(`` and ``)`` characters of
        the whole string are removed before parsing.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Prize`` and ``Year`` (all text, whitespace
        trimmed). Lines with fewer than two commas are skipped. ``None`` or an
        empty string gives an empty frame with the same columns.
    """
    columns = ['Name', 'Prize', 'Year']
    if not data_string:
        # A failed country lookup hands over None; return an empty table
        # instead of raising AttributeError.
        return pd.DataFrame([], columns=columns)

    records = []
    for line in data_string.strip("()").split('\n'):
        # Split from the right: the last two fields are the prize and the
        # year, and whatever precedes them is the name. This keeps entries
        # whose name part itself contains a comma, which a plain split(',')
        # followed by a length check would silently discard.
        parts = [part.strip() for part in line.rsplit(',', 2)]
        if len(parts) == 3:
            records.append(parts)

    return pd.DataFrame(records, columns=columns)

In [176]:
def find_final(country, category):
    """Display the laureates listed under ``country`` for one prize ``category``.

    Parameters
    ----------
    country : str
        Passed to ``give_me_people_by_country``.
    category : str
        Compared for equality against the parsed ``Prize`` column.

    Returns
    -------
    None
        The result is shown with ``display``; a message is printed when there
        is nothing to show.
    """
    lista_de_personas = give_me_people_by_country(country)
    if lista_de_personas is None:
        # The lookup has already printed its own message; stop here rather
        # than passing None on to the parser.
        return

    laureates = filterable(lista_de_personas)
    in_category = laureates[laureates['Prize'] == category]

    if in_category.empty:
        print(f'No awards for {country} in {category}')
        return

    # Lift the row limit only for this one display call, so the notebook-wide
    # pandas display settings are left as they were.
    with pd.option_context('display.max_rows', None):
        display(in_category[['Name', 'Prize']])
        
        

In [177]:
def wiki_nobel():
    """Show category and country dropdowns plus a button that runs the search.

    Clicking the button clears the cell output, shows the controls again and
    calls ``find_final`` with the currently selected country and category.
    """
    # NOTE(review): `widgets` is not imported in any of the visible cells —
    # presumably `import ipywidgets as widgets`; confirm that import exists,
    # otherwise this function fails with NameError on a fresh kernel.
    category_dropdown = widgets.Dropdown(
        options=['Physics', 'Chemistry', 'Medicine', 'Literature', 'Peace', 'Economics'],
        description='Category:',
        disabled=False,
    )

    # Country names are the odd-indexed entries of the list returned by
    # set_countries_to_list(). Build the options from a slice instead of
    # looping over the list and writing its entries back a second time.
    list_of_countries = [entry.strip() for entry in set_countries_to_list()[1::2]]

    country_dropdown = widgets.Dropdown(
        options=list_of_countries,
        description='Country:',
        disabled=False,
    )

    button = widgets.Button(description='Search Nobel!')

    def show_controls():
        display(category_dropdown, country_dropdown, button)

    def on_button_clicked(b):
        # Wipe the previous result, then put the controls back before the
        # new result is shown underneath them.
        clear_output()
        show_controls()
        find_final(country_dropdown.value, category_dropdown.value)

    show_controls()
    button.on_click(on_button_clicked)

In [None]:
# Show the dropdowns and the button; a result is displayed only after the
# "Search Nobel!" button is clicked.
wiki_nobel()

Dropdown(description='Category:', index=3, options=('Physics', 'Chemistry', 'Medicine', 'Literature', 'Peace',…

Dropdown(description='Country:', index=14, options=('Algeria', 'Argentina', 'Armenia', 'Australia', 'Austria',…

Button(description='Search Nobel!', style=ButtonStyle())

Unnamed: 0,Name,Prize
1,Gabriel García Márquez,Literature


In [166]:
# Spot checks: the 'physics' entry for 1998 split into names, then the columns
# of df in which the text 'Andrea Ghez' occurs.
print(look_year_and_category(1998, 'physics'))
print(find_column_with_name('Andrea Ghez'))

['Robert B. Laughlin', 'Horst Ludwig Störmer', 'Daniel C. Tsui']
[]
