In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from cassandrautils import *


#Data Retrieval and Preprocessing

In [None]:
# Fetch the weather records through the cassandrautils helper
weather = getWeatherDF()

# Preprocessing: derive a timestamp column from forecastdate, then convert the
# sunrise/sunset values (interpreted as epoch seconds via unit='s') to datetimes
weather['forecast_timestamp'] = pd.to_datetime(weather['forecastdate'])
for epoch_column in ('sunrise', 'sunset'):
    weather[epoch_column] = pd.to_datetime(weather[epoch_column], unit='s')

# Report the time span covered by the loaded records
forecast_times = weather['forecast_timestamp']
print("Weather from ", forecast_times.min(), " To ", forecast_times.max())

In [11]:
# Preview a bounded number of rows instead of rendering the whole frame,
# so the cell output stays small however many records were loaded
weather.head(10)

Unnamed: 0,location,forecastdate,description,feels_like,humidity,pressure,sunrise,sunset,temp,temp_max,temp_min,wind,forecast_timestamp
0,Vancouver,2021-06-05 05:01:07,overcast clouds,285.73999,87.0,1014.0,1622808588,1622866336,286.119995,288.75,284.609985,0.45,2021-06-05 05:01:07
1,Vancouver,2021-06-05 05:06:08,overcast clouds,285.73999,87.0,1014.0,1622808588,1622866336,286.119995,288.75,284.609985,0.45,2021-06-05 05:06:08
2,Vancouver,2021-06-05 05:11:08,overcast clouds,285.51001,87.0,1014.0,1622808588,1622866336,285.910004,288.070007,284.390015,0.89,2021-06-05 05:11:08
3,Vancouver,2021-06-05 05:16:09,overcast clouds,285.51001,87.0,1014.0,1622808588,1622866336,285.910004,288.070007,284.390015,0.89,2021-06-05 05:16:09
4,Vancouver,2021-06-05 05:21:09,overcast clouds,285.390015,88.0,1014.0,1622808588,1622866336,285.769989,288.279999,284.119995,0.45,2021-06-05 05:21:09
5,Vancouver,2021-06-05 05:26:09,overcast clouds,285.390015,88.0,1014.0,1622808588,1622866336,285.769989,288.279999,284.119995,0.45,2021-06-05 05:26:09
6,Vancouver,2021-06-05 05:31:11,overcast clouds,285.350006,88.0,1014.0,1622808588,1622866336,285.73999,288.100006,284.119995,0.89,2021-06-05 05:31:11
7,Vancouver,2021-06-05 05:36:12,overcast clouds,285.350006,88.0,1014.0,1622808588,1622866336,285.73999,288.100006,284.119995,0.89,2021-06-05 05:36:12
8,Vancouver,2021-06-05 05:41:12,overcast clouds,285.290009,88.0,1013.0,1622808588,1622866336,285.679993,288.100006,284.119995,0.89,2021-06-05 05:41:12
9,Vancouver,2021-06-05 05:46:12,overcast clouds,285.290009,88.0,1013.0,1622808588,1622866336,285.679993,288.100006,284.119995,0.89,2021-06-05 05:46:12


#Plot Weather Record for continuous observation

In [None]:
#Filter data by attribute "location"
def filter_by_location(df, location="Tokyo"):
    """Return the rows of ``df`` whose "location" column equals ``location``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "location" column.
    location : str, default "Tokyo"
        Value compared for equality against the "location" column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    is_match = df["location"].eq(location)
    return df.loc[is_match]

# Build one filtered frame per city of interest (Tokyo first, then New York)
weatherTokyo, weatherNewYork = (
    filter_by_location(weather, city) for city in ("Tokyo", "New York")
)

In [None]:
# Tokyo: temperature over time, drawn on an explicit Axes
print("Total records: ", len(weatherTokyo))
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(weatherTokyo['forecast_timestamp'], weatherTokyo['temp'])
ax.set_xlabel("Time")
ax.set_ylabel("Temperature")
ax.set_title("Tokyo Weather")

In [None]:
# New York: temperature over time, drawn on an explicit Axes
print("Total records: ", len(weatherNewYork))
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(weatherNewYork['forecast_timestamp'], weatherNewYork['temp'])
ax.set_xlabel("Time")
ax.set_ylabel("Temperature")
ax.set_title("New York Weather")

#Heatmap for deeper investigation

In [None]:
# Weather attributes that feed the correlation heatmaps
numeric_columns = [
    'temp',
    'humidity',
    'pressure',
    'wind',
]

In [None]:
# Tokyo: pairwise correlations between the selected numeric attributes
corr_matrix = weatherTokyo.loc[:, numeric_columns].corr()

# Draw the annotated correlation heatmap on its own figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Tokyo')
plt.show()

In [None]:
# New York: pairwise correlations between the selected numeric attributes
corr_matrix = weatherNewYork.loc[:, numeric_columns].corr()

# Draw the annotated correlation heatmap on its own figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of New York')
plt.show()

FAKER DATA VISUALIZATION

In [None]:
from cassandrautils import getFakerDF

# Load the faker dataset through the cassandrautils helper
fakerData = getFakerDF()
# String-valued job_history entries are evaluated as Python expressions; any
# non-string value is passed through unchanged.
# NOTE(review): eval() runs arbitrary code, so this is only safe while the stored
# strings are trusted. If they are plain literals (e.g. dict reprs),
# ast.literal_eval would be a safer parser — TODO confirm the stored format.
fakerData['job_history'] = fakerData['job_history'].apply(lambda x: eval(x) if isinstance(x, str) else x)

In [None]:
# Summarise which Python types occur in the job_history column. This replaces a
# probe of one hard-coded row label, which raises KeyError whenever that label
# is absent (e.g. a shorter or re-indexed frame).
fakerData["job_history"].map(type).value_counts()

In [None]:
#Need to turn the special type into a usable dict type for further analysis
def to_dict(x):
    """Convert ``x`` to a plain ``dict``, falling back to an empty dict.

    Parameters
    ----------
    x : object
        Any value accepted by ``dict(...)`` (a mapping or an iterable of
        key/value pairs).

    Returns
    -------
    dict
        ``dict(x)`` when the conversion succeeds; ``{}`` when ``dict(x)``
        raises any ``Exception`` (best-effort, the error is not reported).
    """
    try:
        converted = dict(x)
    except Exception:
        converted = {}
    return converted

# Store a plain-dict version of every job_history entry in a new column
fakerData['job_history_dict'] = fakerData['job_history'].map(to_dict)

In [None]:
# Preview a bounded number of rows instead of rendering the whole frame
fakerData.head(10)

In [None]:
# Total years worked per person: the sum of the values in their job_history_dict
fakerData['total_years_worked'] = fakerData['job_history_dict'].apply(
    lambda jobs: sum(jobs.values())
)

# Histogram with 2-wide bins whose edges run from 0 to 30
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(fakerData['total_years_worked'], bins=range(0, 31, 2), edgecolor='black')
ax.set_title('Distribution of Total Years Worked')
ax.set_xlabel('Total Years Worked')
ax.set_ylabel('Number of People')
ax.grid(axis='y')
plt.show()

In [None]:
# Count the entries in each job_history_dict, then chart how many people share each count
fakerData['num_jobs'] = fakerData['job_history_dict'].apply(len)
jobs_per_person = fakerData['num_jobs'].value_counts().sort_index()
ax = jobs_per_person.plot.bar()
ax.set_title('Number of Jobs per Person')
ax.set_xlabel('Jobs')
ax.set_ylabel('People')
plt.show()

#For market research about the current relationship between experience and age

In [None]:
# Mean total_years_worked for each year_of_birth, flattened back into columns
average_exp_per_birth_year = fakerData.groupby('year_of_birth')['total_years_worked'].mean().reset_index()

# Line chart of the per-birth-year averages, one marker per year
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    average_exp_per_birth_year['year_of_birth'],
    average_exp_per_birth_year['total_years_worked'],
    marker='o',
)
ax.set_xlabel('Year of Birth')
ax.set_ylabel('Average Total Years Worked')
ax.set_title('Average Work Experience by Year of Birth')
ax.grid(True)
plt.show()


#Interesting list for headhunters

In [None]:
# Tenure threshold in years: people are listed when any single entry in their
# job history is strictly greater than this value
year_threshold = 9  # Or any number you want

def has_long_job(job_history, threshold):
    """Tell whether any value in ``job_history`` is strictly above ``threshold``.

    Parameters
    ----------
    job_history : mapping
        Object exposing ``.values()``; each value is compared to ``threshold``.
    threshold : number
        Exclusive lower bound a value must exceed.

    Returns
    -------
    bool
        ``True`` as soon as one value exceeds ``threshold``; ``False`` when
        none does (including for an empty mapping).
    """
    for years in job_history.values():
        if years > threshold:
            return True
    return False

# Keep only the people with at least one job lasting longer than year_threshold
long_workers = fakerData[
    fakerData['job_history_dict'].apply(has_long_job, threshold=year_threshold)
]

# Display the matching names with their job histories, then the head count
long_workers_list = long_workers.loc[:, ['name', 'job_history_dict']]
display(long_workers_list)
print(f"Number of people who have worked at any place for more than {year_threshold} years:", len(long_workers_list))


In [None]:
#Print in long list, will not run now due to being lengthy
# For every person selected above, list each job whose duration exceeds
# year_threshold. The loop reads job_history_dict (the same column the filter
# used), where every entry is a plain dict, rather than the raw job_history
# column whose values are not guaranteed to provide .items().
for person, jobs in zip(long_workers['name'], long_workers['job_history_dict']):
    print(f"{person}:")
    for company, years in jobs.items():
        if years > year_threshold:
            print(f"  - {company}: {years} years")
    print()

JOKE ANALYSIS FOR LLM TOKEN COUNT (PURELY FOR DEMONSTRATION; NO REAL LLM TECHNIQUES OR OFFICIAL TOKENIZATION WERE USED)

In [None]:
from cassandrautils import getJokeDF
# Fetch the jokes and discard rows where joke, setup and delivery are all missing
joke_df = getJokeDF().dropna(how='all', subset=['joke', 'setup', 'delivery'])

In [None]:
def count_tokens(text):
    """Count the whitespace-separated tokens in ``text``.

    Parameters
    ----------
    text : str or missing value
        The text to count. Anything that is not a ``str`` (``None``, a
        float NaN from pandas, ...) is treated as "no text".

    Returns
    -------
    int
        Number of tokens produced by ``str.split()``; 0 for an empty string
        or a non-string value.
    """
    # NaN is truthy, so a plain truthiness check would let it through and then
    # fail on .split(); checking the type covers None and NaN in one guard.
    if not isinstance(text, str):
        return 0
    return len(text.split())

In [None]:
# Calculate the token count for each joke
def total_joke_tokens(row):
    """Return the token count of one joke row.

    Rows whose 'type' is 'single' are counted from their 'joke' field; every
    other row is counted as the sum of its 'setup' and 'delivery' fields.
    """
    if row['type'] != 'single':
        return count_tokens(row['setup']) + count_tokens(row['delivery'])
    return count_tokens(row['joke'])

# Apply total_joke_tokens row by row (axis=1) and store the result in token_count
joke_df['token_count'] = joke_df.apply(total_joke_tokens, axis=1)

In [None]:
import matplotlib.pyplot as plt

# Histogram of tokens per joke: 2-wide bins starting at 0 and extending past the largest count
fig, ax = plt.subplots(figsize=(10, 5))
token_bins = range(0, max(joke_df['token_count']) + 5, 2)
ax.hist(joke_df['token_count'], bins=token_bins, edgecolor='black')
ax.set_title("Joke Token Count Distribution")
ax.set_xlabel("Token count (words per joke)")
ax.set_ylabel("Number of jokes")
ax.grid(axis='y')
plt.show()
