## Assignment 9.2
Author: Rex Gayas
Date: 11 February 2024
Modified By: N/A
Description: Exploring data extraction and manipulation and applying different data storage formats for efficient data handling and analysis.

In [22]:
# Task 1
import numpy as np
import pandas as pd

# Seed the random generator 
np.random.seed(0)

# Generate a 3x4 NumPy array with random values
random_array = np.random.rand(3, 4)

# Save the array as a CSV file named np.csv
np.savetxt("np.csv", random_array, delimiter=",")

# Create a DataFrame from the CSV file
df_from_csv = pd.read_csv("np.csv", header=None)

# Print the results of the DataFrame
print(df_from_csv)

# Write the DataFrame to a new CSV file
df_from_csv.to_csv("new_np.csv", index=False)


          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.423655  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.791725  0.528895


In [23]:
# Tasks 2 & 3
import numpy as np
import pandas as pd
import os

# Generate a 365x4 NumPy array with random values
np.random.seed(0)
random_array_large = np.random.rand(365, 4)

# Store the large array in a CSV file and check its size
np.savetxt("large_np.csv", random_array_large, delimiter=",")

# Save the large array in the NumPy binary format (.npy)
np.save("large_np.npy", random_array_large)

# Load the array from the .npy file
loaded_array = np.load("large_np.npy")

# Create a DataFrame from this array
df_from_npy = pd.DataFrame(loaded_array)

# Write the DataFrame to a pickle file
df_from_npy.to_pickle("df_from_npy.pkl")

# Retrieve the DataFrame from the pickle file
retrieved_df = pd.read_pickle("df_from_npy.pkl")

# Print the size of the pickle file and the retrieved DataFrame
pickle_file_size = os.path.getsize("df_from_npy.pkl")
print("Size of the pickle file:", pickle_file_size)
print("Shape of the retrieved DataFrame:", retrieved_df.shape)

# Create a DataFrame from the large array and write it to an Excel file
df_from_npy.to_excel("large_data.xlsx", index=False)

# Read the DataFrame back from the Excel file
df_from_excel = pd.read_excel("large_data.xlsx")

# Print the results of the DataFrame from Excel
print(df_from_excel)


Size of the pickle file: 12239
Shape of the retrieved DataFrame: (365, 4)
            0         1         2         3
0    0.548814  0.715189  0.602763  0.544883
1    0.423655  0.645894  0.437587  0.891773
2    0.963663  0.383442  0.791725  0.528895
3    0.568045  0.925597  0.071036  0.087129
4    0.020218  0.832620  0.778157  0.870012
..        ...       ...       ...       ...
360  0.489685  0.131687  0.397014  0.704402
361  0.284886  0.103988  0.907898  0.709051
362  0.615276  0.792499  0.835646  0.483459
363  0.881188  0.916419  0.271551  0.607545
364  0.526584  0.537946  0.937663  0.305189

[365 rows x 4 columns]


In [24]:
# Tasks 4 & 5
import pandas as pd
import json
from io import StringIO

# Given JSON string
json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

# Wrap the JSON string in a StringIO object 
json_data_io = StringIO(json_str)

# Parse JSON string with loads() function to create a dictionary
data = json.loads(json_str)

# Create a DataFrame from the parsed JSON
df = pd.DataFrame([data])

# Print the values for the “Country” column
print("Original country value:")
print(df['country'])

# Overwrite the value for Netherlands with 'Philippines'
df.at[0, 'country'] = 'Philippines'

# Print the modified DataFrame
print("\nModified DataFrame with new country value:")
print(df)

# Convert the DataFrame to a JSON string
json_result = df.to_json(orient='records')

# Print the JSON string
print("\nModified JSON string with new country value:")
print(json_result)

# Wrap the JSON string in a StringIO object before passing it to read_json()
json_result_io = StringIO(json_result)

# Use the Pandas read_json() function to create a Series from the JSON string
series = pd.read_json(json_result_io, typ='series')

# Change the country value again 
series['country'] = 'England'

# Convert the Pandas Series to a JSON string
json_series_result = series.to_json()

# Print the modified JSON string
print("\nModified JSON string after changing the country value again:")
print(json_series_result)





Original country value:
0    Netherlands
Name: country, dtype: object

Modified DataFrame with new country value:
       country dma_code          timezone area_code            ip       asn  \
0  Philippines        0  Europe/Amsterdam         0  46.19.37.108  AS196752   

  continent_code           isp  longitude  latitude country_code country_code3  
0             EU  Tilaa V.O.F.       5.75      52.5           NL           NLD  

Modified JSON string with new country value:
[{"country":"Philippines","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}]

Modified JSON string after changing the country value again:
{"0":{"country":"Philippines","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"countr

In [25]:
# Task 6
from bs4 import BeautifulSoup
import requests
import re

# GitHub link to the loremIpsum.html file
url = "https://raw.githubusercontent.com/Rexsophy/Python-Data-Analysis/master/Chapter05/loremIpsum.html"

# Use requests to get the content of the HTML page from the GitHub link
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'lxml')

    # First div element and its class
    first_div = soup.find('div')
    print("First div:", first_div)
    print("First div class:", first_div.get('class') if first_div else 'No class attribute')

    # First dfn text
    dfn_text = soup.dfn.text if soup.dfn else 'No dfn tag found'
    print("First dfn text:", dfn_text)

    # All hyperlinks with their text and URLs
    for link in soup.find_all('a'):
        print("Link text:", link.string, "URL:", link.get('href'))

    # Content of all div tags
    for i, div in enumerate(soup.find_all('div')):
        print(f"Div {i} contents:", div.contents)

    # Div with id="official" and its specific content
    official_div = soup.find("div", id="official")
    print("Official Version:", official_div.get_text(strip=True) if official_div else 'No official div')

    # Number of div elements with any class attribute
    print("# elements with class:", len(soup.find_all("div", class_=True)))

    # Number of div elements with class="title"
    title_class = soup.find_all("div", class_="title")
    print("# Title classes:", len(title_class))

    # Number of div elements with class containing "title" using regex
    divs_with_tile = soup.find_all("div", class_=re.compile(r"\btitle\b"))
    print("# Divs with class containing title:", len(divs_with_tile))

    # CSS selector to find divs with class "notitle"
    print("Using CSS selector for notitle:", soup.select('div.notitle'))

    # The first two list items in an ordered list
    print("Selecting ordered list first list items:", soup.select("ol > li")[:2])

    # The second list item in an ordered list using CSS selector
    print("Second list item in ordered list:", soup.select_one("ol > li:nth-of-type(2)").get_text(strip=True))

    # Text nodes containing "2014"
    print("Searching for text string '2014':", soup.find_all(string=re.compile("2014")))
else:
    print(f"Failed to fetch HTML content from GitHub. Status code: {response.status_code}")


First div: <div class="tile">
<h4>Development</h4>
     0.10.1 - July 2014<br/>
</div>
First div class: ['tile']
First dfn text: Quare attende, quaeso.
Link text: loripsum.net URL: http://loripsum.net/
Link text: Poterat autem inpune; URL: http://loripsum.net/
Link text: Is es profecto tu. URL: http://loripsum.net/
Div 0 contents: ['\n', <h4>Development</h4>, '\n     0.10.1 - July 2014', <br/>, '\n']
Div 1 contents: ['\n', <h4>Official Release</h4>, '\n     0.10.0 June 2014', <br/>, '\n']
Div 2 contents: ['\n', <h4>Previous Release</h4>, '\n     0.09.1 June 2013', <br/>, '\n']
Official Version: Official Release0.10.0 June 2014
# elements with class: 3
# Title classes: 0
# Divs with class containing title: 0
Using CSS selector for notitle: []
Selecting ordered list first list items: [<li>Cur id non ita fit?</li>, <li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]
Second list item in ordered list: In qua si nihil est praeter rationem, sit in una virtute fin