In [None]:
import requests
import concurrent.futures
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import statsmodels.formula.api as smf
from scipy.stats import ttest_ind

# 1. Fetch Sunrise-Sunset Data

In [None]:
def fetch_sun_data(date_str, lat=55.952061, lng=-3.196480, formatted=0, timeout=10):
    """
    Fetch sunrise-sunset data for a given date from api.sunrise-sunset.org.

    Parameters
    ----------
    date_str : str
        Date to query. It is sent as the ``date`` query parameter and is also
        stored in the returned dict under the key ``"date"``.
    lat, lng : float
        Coordinates sent to the API as ``lat`` and ``lng``.
    formatted : int
        Using 'formatted=0' returns ISO8601 timestamps (in UTC).
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block its worker thread indefinitely.

    Returns
    -------
    dict or None
        The API's ``results`` dict with an extra ``"date"`` key, or ``None``
        when the API reports a non-"OK" status or the request fails. Failures
        are printed rather than raised so that a bulk download keeps going.
    """
    url = "https://api.sunrise-sunset.org/json"
    params = {
        "lat": lat,
        "lng": lng,
        "date": date_str,
        "formatted": formatted
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        data = response.json()
        if data.get("status") == "OK":
            result = data["results"]
            result["date"] = date_str  # Add the date for merging later
            return result
        else:
            print(f"Error fetching data for {date_str}: {data}")
            return None
    except Exception as e:
        # Deliberately best-effort: report the problem (including timeouts)
        # and let the caller skip this date.
        print(f"Exception for {date_str}: {e}")
        return None

start_date = datetime(2019, 4, 1)
end_date = datetime(2023, 12, 31)

# One "YYYY-MM-DD" string per calendar day, both endpoints included.
n_days = (end_date - start_date).days + 1
all_dates = [
    (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range(n_days)
]

# Fetch the API data concurrently.
# executor.map yields results in the order of all_dates (as_completed yields
# them in completion order), so df_sun is chronological and identical from
# one run to the next.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    fetched = list(executor.map(fetch_sun_data, all_dates))

# fetch_sun_data returns None for dates it could not retrieve; drop those.
results = [res for res in fetched if res is not None]
n_failed = len(fetched) - len(results)
if n_failed:
    # Missing days silently disappear in the later inner merge, so say so here.
    print(f"Warning: no sunrise-sunset data for {n_failed} of {len(all_dates)} dates")

df_sun = pd.DataFrame(results)
print("Sunrise-Sunset API Data:")
print(df_sun.head())

# 2. Process Sunrise-Sunset Data

In [None]:
def convert_day_length_to_hours(day_length_val):
    """
    Convert a day-length value to hours.

    Parameters
    ----------
    day_length_val : int, float, NumPy numeric scalar, or str
        * numeric (Python or NumPy scalar): interpreted as seconds;
        * string containing colons: must be "HH:MM:SS";
        * string without colons: interpreted as seconds in string form.

    Returns
    -------
    float or None
        The day length in hours, or ``None`` (after printing a message) when
        the value cannot be converted.
    """
    # Numeric values are seconds. np.integer / np.floating are listed
    # explicitly because NumPy integer scalars such as np.int64 are not
    # instances of Python's int and would otherwise fall through to the
    # string branch and be rejected.
    if isinstance(day_length_val, (int, float, np.integer, np.floating)):
        return float(day_length_val) / 3600.0

    # Otherwise, assume it's a string.
    try:
        if ':' in day_length_val:
            parts = day_length_val.split(':')
            if len(parts) != 3:
                raise ValueError("Expected format HH:MM:SS, got: " + str(day_length_val))
            hours, minutes, seconds = map(int, parts)
            return hours + minutes/60 + seconds/3600
        # No colons: seconds in string format.
        return float(day_length_val) / 3600.0
    except Exception as e:
        # Best-effort: report the bad value and let the caller see a missing entry.
        print("Error converting day_length:", day_length_val, e)
        return None

# Day length expressed in hours
df_sun["daylight_hours"] = df_sun["day_length"].apply(convert_day_length_to_hours)

# Parse the ISO8601 sunrise timestamps as UTC, shift them to the Europe/London
# zone, and express the local sunrise as a fractional hour (hour + minute/60).
sunrise_utc = pd.to_datetime(df_sun["sunrise"], utc=True)
sunrise_local = sunrise_utc.dt.tz_convert("Europe/London")
df_sun["sunrise_dt_utc"] = sunrise_utc
df_sun["sunrise_london"] = sunrise_local
df_sun["sunrise_hour"] = sunrise_local.dt.hour + sunrise_local.dt.minute/60

# Datetime version of the API date string, used as the merge key later on
df_sun["Date"] = pd.to_datetime(df_sun["date"])

print("\nProcessed Sunrise-Sunset Data:")
preview_columns = ["date", "day_length", "daylight_hours", "sunrise_hour"]
print(df_sun[preview_columns].head())

# 3. Energy Usage Data from Excel

In [None]:
df = pd.read_excel('HHDATA_318_0001.xlsx')
cutoff = 3137  # only the first `cutoff` rows of the sheet are used
# .copy() gives df_energy its own data; adding a column to a bare iloc slice
# triggers pandas' SettingWithCopyWarning.
df_energy = df.iloc[:cutoff].copy()
# Convert "READING DATE" (assuming MM/DD/YY format; adjust if needed) to datetime
df_energy["Date"] = pd.to_datetime(df_energy["READING DATE"], format="%m/%d/%y")
print("\nEnergy Usage Data (Before Filtering):")
print(df_energy.head())


# Filtering data: only some periods are used; the rest are discarded


In [None]:
# Inclusive (start, end) date ranges whose readings are kept for the analysis.
active_periods = [
    (datetime(2019, 4, 1), datetime(2019, 12, 31)),
    (datetime(2020, 1, 1), datetime(2020, 3, 24)),
    (datetime(2021, 9, 1), datetime(2023, 12, 31)),
]

def is_active(date):
    """Tell whether `date` lies inside one of the `active_periods` ranges.

    Both ends of every range count as inside. Returns True or False.
    """
    for period_start, period_end in active_periods:
        if period_start <= date <= period_end:
            return True
    return False

# assign() returns a new frame, so the flag column is added without writing
# into what may be a slice of the raw sheet (avoids SettingWithCopyWarning).
df_energy = df_energy.assign(is_active=df_energy['Date'].apply(is_active))
df_energy_active = df_energy[df_energy['is_active']].copy()

# Special-period exclusions are currently DISABLED: no festival or spring-break
# rows are removed here. The previous disabled snippet could not be re-enabled
# as written (it used the spring-break bounds before defining them and had an
# unbalanced parenthesis). Corrected version, kept for reference:
#
# festival_start = pd.to_datetime("2023-08-04")   # Fringe Festival: Aug 4 - Aug 28, 2023
# festival_end = pd.to_datetime("2023-08-28")
# df_energy_active = df_energy_active[~df_energy_active["Date"].between(festival_start, festival_end)]
#
# spring_break_start = pd.to_datetime("2023-04-07")
# spring_break_end = pd.to_datetime("2023-04-24")
# df_energy_active = df_energy_active[~df_energy_active["Date"].between(spring_break_start, spring_break_end)]

# The label no longer claims that special periods were excluded, because none are.
print("\nEnergy Usage Data (Active Terms):")
print(df_energy_active.head())

# Drop outliers in "Daily Total" with the 1.5 * IQR rule
daily_total = df_energy_active["Daily Total"]
Q1, Q3 = daily_total.quantile(0.25), daily_total.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() includes both bounds, i.e. lower_bound <= value <= upper_bound
within_bounds = daily_total.between(lower_bound, upper_bound)
df_energy_active_clean = df_energy_active[within_bounds]
print("\nEnergy Usage Data (After Outlier Removal):")
print(df_energy_active_clean.head())

# Attach daylight hours; the inner join keeps only dates present in both tables
df_merged_energy = df_energy_active_clean.merge(
    df_sun[["Date", "daylight_hours"]],
    on="Date",
    how="inner",
)
print("\nMerged Energy and Daylight Data:")
print(df_merged_energy.head())

# 4. Temperature Data

In [None]:
# One MIDAS daily-temperature CSV per year; only the year in the file name changes.
temp_csv_template = (
    'midas-open-daily-temperature-edinburgh-botanics/'
    'midas-open_uk-daily-temperature-obs_dv-202407_midlothian-in-lothian-region'
    '_00253_edinburgh-royal-botanic-garden-no-2_qcv-1_{year}.csv'
)
temp_years = [2019, 2020, 2021, 2022, 2023]

# Read every year the same way and stack them. (The previous hand-written
# version referenced an undefined name, dp_temp4, and raised NameError.)
# skiprows=91 skips the lines above the column header row - presumably the
# MIDAS metadata block; confirm if the file format changes.
yearly_frames = [
    pd.read_csv(temp_csv_template.format(year=year), engine='python', skiprows=91)
    for year in temp_years
]
df_temp = pd.concat(yearly_frames, ignore_index=True)

# Rows whose ob_end_time does not parse become NaT and are dropped.
df_temp['ob_end_time'] = pd.to_datetime(df_temp['ob_end_time'], format="%Y-%m-%d %H:%M:%S", errors='coerce')
df_temp = df_temp.dropna(subset=['ob_end_time'])
# Filter to keep only rows with observation time "09:00:00".
# .copy() so the column assignments below act on an independent frame
# (avoids SettingWithCopyWarning).
df_temp = df_temp[df_temp['ob_end_time'].dt.time == pd.to_datetime("09:00:00").time()].copy()
# Compute mean temperature as the average of max_air_temp and min_air_temp
df_temp['mean_temp'] = (df_temp['max_air_temp'] + df_temp['min_air_temp']) / 2
# Create a Date column from ob_end_time
df_temp['Date'] = pd.to_datetime(df_temp['ob_end_time'].dt.date)
print("\nCleaned Temperature Data:")
print(df_temp[['ob_end_time', 'mean_temp', 'Date']].head())


In [None]:
# Left join: every energy/daylight row is kept; mean_temp is missing where no
# temperature observation exists for that date.
df_final = df_merged_energy.merge(
    df_temp[['Date', 'mean_temp']],
    on="Date",
    how="left",
)
print("\nFinal Merged Data:")
print(df_final.head())

# 5. Checking whether each date falls within a semester

In [None]:
def is_semester(date):
    """Flag dates that fall inside a teaching semester.

    The year is ignored; only month and day are compared. Returns 1 for
    Jan 14 - Apr 05 or Sep 16 - Dec 22 (both ends included), otherwise 0.
    """
    month_day = (date.month, date.day)
    in_spring_term = (1, 14) <= month_day <= (4, 5)
    in_autumn_term = (9, 16) <= month_day <= (12, 22)
    return 1 if (in_spring_term or in_autumn_term) else 0

# Flag each row as semester (1) / non-semester (0), then discard all August rows
df_final["semester"] = df_final["Date"].apply(is_semester)
not_august = df_final["Date"].dt.month != 8
df_final = df_final[not_august].copy()
print(df_final.loc[:, ["Date", "semester", "Daily Total"]].head(20))

# 6. Visualizations

In [None]:
df_semester = df_final[df_final["semester"] == 1].copy()
df_nonsemester = df_final[df_final["semester"] == 0].copy()

# Define predictors and response (using daylight_hours and mean_temp)
# Semester Model
X_sem = df_semester[["daylight_hours", "mean_temp"]]
y_sem = df_semester["Daily Total"]
X_sem_const = sm.add_constant(X_sem)
# mean_temp comes from a left join and can be missing for some dates.
# missing="drop" fits on the complete rows instead of failing on NaN input.
model_sem = sm.OLS(y_sem, X_sem_const, missing="drop").fit()
print("\nSemester Time Regression Summary:")
print(model_sem.summary())



In [None]:
# Non-Semester Model
X_nonsem = df_nonsemester[["daylight_hours", "mean_temp"]]
y_nonsem = df_nonsemester["Daily Total"]
X_nonsem_const = sm.add_constant(X_nonsem)
# mean_temp comes from a left join and can be missing for some dates.
# missing="drop" fits on the complete rows instead of failing on NaN input.
model_nonsem = sm.OLS(y_nonsem, X_nonsem_const, missing="drop").fit()
print("\nNon-Semester Time Regression Summary:")
print(model_nonsem.summary())

In [None]:
# Independent (deep) copy of the merged data for the overall plots below
df_general = df_final.copy(deep=True)

In [None]:
# Plot 1: Daily Electricity Usage Over Time (Line Plot)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_general["Date"], df_general["Daily Total"], marker="o", linestyle="-")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Total Electricity Usage")
ax.set_title("Daily Electricity Usage Over Time")
# Tilt the date labels so they do not overlap
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Plot 2: Scatter Plot (Usage vs. Daylight Hours, Colored by Mean Temperature)
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    df_general["daylight_hours"],
    df_general["Daily Total"],
    c=df_general["mean_temp"],
    cmap="viridis",
    alpha=0.7,
)
ax.set_xlabel("Daylight Hours")
ax.set_ylabel("Daily Total Electricity Usage")
ax.set_title("Electricity Usage vs. Daylight Hours")
# The colour bar maps point colour to mean temperature
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label("Mean Temperature")
plt.show()

In [None]:
# Plot 3: Regression Visualization (Using all data, with temperature held at its mean)
# Re-fit an overall regression model using daylight_hours and mean_temp:
X_overall = df_general[["daylight_hours", "mean_temp"]]
y_overall = df_general["Daily Total"]
X_overall_const = sm.add_constant(X_overall)
# mean_temp comes from a left join and can be missing for some dates.
# missing="drop" fits on the complete rows instead of failing on NaN input.
model_overall = sm.OLS(y_overall, X_overall_const, missing="drop").fit()

# Create a range of daylight_hours for prediction
X_line = np.linspace(df_general["daylight_hours"].min(),
                     df_general["daylight_hours"].max(), 100)
mean_temp_val = df_general["mean_temp"].mean()
df_pred = pd.DataFrame({
    "daylight_hours": X_line,
    "mean_temp": mean_temp_val  # Temperature held at its overall mean
})
# has_constant='add' forces the intercept column in: mean_temp is itself
# constant here, so the default would treat it as the constant and add nothing.
df_pred_const = sm.add_constant(df_pred, has_constant='add')
y_line = model_overall.predict(df_pred_const)

plt.figure(figsize=(8,6))
plt.scatter(df_general["daylight_hours"], df_general["Daily Total"], alpha=0.7, label="Data Points")
plt.plot(X_line, y_line, color="red", label="Regression Line (Temp at Mean)")
plt.xlabel("Daylight Hours")
plt.ylabel("Daily Total Electricity Usage")
plt.title("Electricity Usage vs. Daylight Hours\n(Temperature Held at Mean)")
plt.legend()
plt.show()

In [None]:
# Independent copy of the data with any August rows removed.
df_no_august = df_final[df_final["Date"].dt.month != 8].copy()

# Create a binary "semester" column based on the month-day (ignoring year).
# This reuses the is_semester() helper defined earlier in the notebook; the
# identical second definition that used to sit here was removed so that there
# is a single source of truth for the semester dates.
df_no_august["semester"] = df_no_august["Date"].apply(is_semester)

# --- Statistical Testing with a Two-Sample t-test ---
# equal_var=False selects Welch's t-test (no equal-variance assumption)
semester_data = df_no_august.loc[df_no_august["semester"] == 1, "Daily Total"]
nonsemester_data = df_no_august.loc[df_no_august["semester"] == 0, "Daily Total"]

t_stat, p_val = ttest_ind(semester_data, nonsemester_data, equal_var=False)
print("T-statistic:", t_stat)
print("P-value:", p_val)

# --- Calculate 95% Confidence Intervals for Each Group ---
# Normal approximation: mean +/- 1.96 * standard error
sem_mean, nonsem_mean = semester_data.mean(), nonsemester_data.mean()
sem_std, nonsem_std = semester_data.std(), nonsemester_data.std()
n_sem, n_nonsem = len(semester_data), len(nonsemester_data)

sem_se = sem_std / np.sqrt(n_sem)
nonsem_se = nonsem_std / np.sqrt(n_nonsem)

sem_margin = 1.96 * sem_se
nonsem_margin = 1.96 * nonsem_se
sem_ci = (sem_mean - sem_margin, sem_mean + sem_margin)
nonsem_ci = (nonsem_mean - nonsem_margin, nonsem_mean + nonsem_margin)

print("Semester Mean:", sem_mean, "95% CI:", sem_ci)
print("Non-Semester Mean:", nonsem_mean, "95% CI:", nonsem_ci)

# --- Regression Model with Dummy Variable ---
# Model: Daily Total = β₀ + β₁ * semester
X_reg = sm.add_constant(df_no_august["semester"])  # Add constant term
y_reg = df_no_august["Daily Total"]
model_simple = sm.OLS(y_reg, X_reg).fit()
# Print the model fitted on the line above. The previous code printed `model`,
# a name that is not defined at this point when the notebook runs top to bottom.
print(model_simple.summary())

# --- Visualization: Bar Plot with Confidence Intervals ---
groups = ['Semester', 'Non-Semester']
means = [sem_mean, nonsem_mean]
errors = [1.96 * sem_se, 1.96 * nonsem_se]  # Half-widths of the 95% CIs

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(groups, means, yerr=errors, capsize=10, color=['blue', 'orange'])
ax.set_ylabel('Mean Daily Total Electricity Usage')
ax.set_title('Comparison of Electricity Usage: Semester vs. Non-Semester (Excluding August)')
plt.show()

In [None]:
# 1. Predictors (semester flag, temperature, daylight) and response
X = df_no_august[["semester", "mean_temp", "daylight_hours"]]
y = df_no_august["Daily Total"]

# 2. Add a constant (intercept)
X_const = sm.add_constant(X)

# 3. Fit the model
# mean_temp comes from a left join and can be missing for some dates.
# missing="drop" fits on the complete rows instead of failing on NaN input.
model_multi = sm.OLS(y, X_const, missing="drop").fit()
print("\nMultiple Regression Summary (Semester + Temp + Daylight):")
print(model_multi.summary())

In [None]:
df_no_august["mean_temp_squared"] = df_no_august["mean_temp"] ** 2

# Define predictors including the quadratic term
X_quad = df_no_august[["semester", "mean_temp", "mean_temp_squared", "daylight_hours"]]
X_quad_const = sm.add_constant(X_quad)
# Take the response straight from df_no_august rather than relying on a `y`
# left behind by another cell, so this cell works on its own.
# missing="drop" fits on the complete rows when mean_temp is missing.
model_quad = sm.OLS(df_no_august["Daily Total"], X_quad_const, missing="drop").fit()
print("\nRegression with Quadratic Temperature Term:")
print(model_quad.summary())

In [None]:
# Human-readable group name for each row
semester_names = {1: "Semester", 0: "Non-Semester"}
df_final["semester_label"] = df_final["semester"].map(semester_names)

# Per-group mean, standard deviation and row count of the daily totals
group_stats = (
    df_final
    .groupby("semester_label")["Daily Total"]
    .agg(["mean", "std", "count"])
    .reset_index()
)

# Standard error and 95% CI (mean +/- 1.96 * SE) for each group
group_stats["se"] = group_stats["std"] / np.sqrt(group_stats["count"])
ci_half_width = 1.96 * group_stats["se"]
group_stats["ci_lower"] = group_stats["mean"] - ci_half_width
group_stats["ci_upper"] = group_stats["mean"] + ci_half_width

print(group_stats)

# Bar plot of the group means
plt.figure(figsize=(8,6))
sns.set_theme(style="whitegrid", context="talk", palette="muted")

ax = sns.barplot(x="semester_label", y="mean", data=group_stats,
                 palette=["#4c72b0", "#55a868"], edgecolor="black", capsize=0.1)

# Draw the 95% CI error bars by hand; bar k sits at x = k
for bar_x, (bar_mean, bar_se) in enumerate(zip(group_stats["mean"], group_stats["se"])):
    ax.errorbar(bar_x, bar_mean,
                yerr=1.96 * bar_se,
                fmt='none',
                c='black',
                capsize=10)

# Label every bar with its mean and sample size
for bar_x, (bar_mean, bar_count) in enumerate(zip(group_stats["mean"], group_stats["count"])):
    ax.text(bar_x, bar_mean, f"\nmean={bar_mean:.1f}\nn={int(bar_count)}",
            horizontalalignment='center', verticalalignment='bottom', fontsize=12)

plt.xlabel("Period")
plt.ylabel("Mean Daily Electricity Usage (kWh)")
plt.title("Comparison of Electricity Usage: Semester vs. Non-Semester")
plt.show()

In [None]:
# Human-readable group name for each row
df_final["semester_label"] = df_final["semester"].map({1: "Semester", 0: "Non-Semester"})

# Daily totals of each group, in the order the boxes are drawn
labels_col = df_final["semester_label"]
semester_data = df_final.loc[labels_col == "Semester", "Daily Total"]
non_semester_data = df_final.loc[labels_col == "Non-Semester", "Daily Total"]
data_to_plot = [semester_data, non_semester_data]

plt.figure(figsize=(8, 6))

# patch_artist=True makes the boxes fillable; outlier markers are hidden
box = plt.boxplot(
    data_to_plot,
    labels=["Semester", "Non-Semester"],
    patch_artist=True,
    showfliers=False,
    widths=0.6,
)

# One bright fill colour per box
bright_colors = ["yellow", "green"]
for patch, color in zip(box['boxes'], bright_colors):
    patch.set_facecolor(color)

# Keep whiskers, caps and medians clearly visible against the fills
plt.setp(box['whiskers'], color='black', linewidth=1.5)
plt.setp(box['caps'], color='black', linewidth=1.5)
plt.setp(box['medians'], color='red', linewidth=2)

plt.xlabel("Period")
plt.ylabel("Daily Electricity Usage (kWh)")

plt.show()

In [None]:
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_semester, ax_temp, ax_daylight, ax_quad = axes.flat

# 1. Simple Regression (Semester Only) Visualization
sns.boxplot(x='semester', y='Daily Total', data=df_no_august, ax=ax_semester)
ax_semester.set_title("Daily Energy Use by Semester Status")
ax_semester.set_xlabel("Semester (1 = In Session)")
ax_semester.set_ylabel("Energy (kWh)")

# 2. Multiple Regression - Temperature Effect
sns.regplot(x='mean_temp', y='Daily Total', data=df_no_august,
            scatter_kws={'alpha':0.3}, line_kws={'color':'red'}, ax=ax_temp)
ax_temp.set_title("Energy vs. Temperature")
ax_temp.set_xlabel("Mean Temperature (°C)")
ax_temp.set_ylabel("Energy (kWh)")

# 3. Multiple Regression - Daylight Effect
sns.regplot(x='daylight_hours', y='Daily Total', data=df_no_august,
            scatter_kws={'alpha':0.3}, line_kws={'color':'green'}, ax=ax_daylight)
ax_daylight.set_title("Energy vs. Daylight Hours")
ax_daylight.set_xlabel("Daylight Hours")
ax_daylight.set_ylabel("Energy (kWh)")

# 4. Quadratic Temperature Effect (order=2 fits a second-degree polynomial)
sns.regplot(x='mean_temp', y='Daily Total', data=df_no_august,
            order=2, scatter_kws={'alpha':0.3},
            line_kws={'color':'purple'}, ax=ax_quad)
ax_quad.set_title("Quadratic Temperature Effect")
ax_quad.set_xlabel("Mean Temperature (°C)")
ax_quad.set_ylabel("Energy (kWh)")

fig.tight_layout()
plt.show()

# Coefficient Plot for Model Comparison
models = {
    'Multiple': model_multi,
    'Quadratic': model_quad
}

# One record per (model, predictor): the estimate and its confidence bounds.
# Values are looked up by variable name; integer keys on a name-indexed
# Series (params[i]) are deprecated positional access in pandas.
coefs = []
for name, fitted in models.items():
    conf = fitted.conf_int()
    for var in fitted.params.index[1:]:  # Skip intercept
        ci_low, ci_high = conf.loc[var]
        coefs.append({'Model': name, 'Variable': var,
                      'Coefficient': fitted.params[var],
                      'CI_low': ci_low,
                      'CI_high': ci_high})

coef_df = pd.DataFrame(coefs)
plt.figure(figsize=(12, 6))

# Every predictor gets ONE x slot shared by all models. Numbering the rows of
# each model separately put the Quadratic model's mean_temp_squared and
# daylight_hours under each other's tick labels, because the models do not
# list the same predictors in the same order.
variables = coef_df['Variable'].unique()
var_position = {var: pos for pos, var in enumerate(variables)}

model_names = coef_df['Model'].unique()
n_models = len(model_names)
dodge_width = 0.4

for i, model_name in enumerate(model_names):
    model_df = coef_df[coef_df['Model'] == model_name]
    # Small horizontal shift so points of different models do not overlap
    offset = (i - (n_models-1)/2) * dodge_width/n_models
    x_pos = model_df['Variable'].map(var_position).to_numpy() + offset
    estimate = model_df['Coefficient'].to_numpy()
    # (2, n) error array: distance down to CI_low and up to CI_high
    yerr = [estimate - model_df['CI_low'].to_numpy(),
            model_df['CI_high'].to_numpy() - estimate]

    plt.errorbar(x=x_pos,
                 y=estimate,
                 yerr=yerr,
                 fmt='o',
                 capsize=5,
                 label=model_name)

plt.axhline(0, color='gray', linestyle='--')
plt.xticks(np.arange(len(variables)), variables)
plt.title("Regression Coefficients Comparison")
plt.xlabel("Predictor Variable")
plt.ylabel("Coefficient Value")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Split the data on the semester flag
semester_flag = df_final["semester"]
df_semester = df_final[semester_flag == 1]
df_nonsemester = df_final[semester_flag == 0]

# --- SEMESTER MODEL: Daily Total ~ const + daylight_hours ---
X_sem, y_sem = df_semester[["daylight_hours"]], df_semester["Daily Total"]
X_sem_const = sm.add_constant(X_sem)
model_sem = sm.OLS(y_sem, X_sem_const).fit()

# --- NON-SEMESTER MODEL: same specification on the other subset ---
X_nonsem, y_nonsem = df_nonsemester[["daylight_hours"]], df_nonsemester["Daily Total"]
X_nonsem_const = sm.add_constant(X_nonsem)
model_nonsem = sm.OLS(y_nonsem, X_nonsem_const).fit()

# 100 evenly spaced daylight values spanning the full observed range,
# shared by both models
daylight = df_final["daylight_hours"]
min_x, max_x = daylight.min(), daylight.max()
X_line = np.linspace(min_x, max_x, 100)

# Prediction inputs (with intercept column) and fitted values per model
df_pred_sem = pd.DataFrame({"daylight_hours": X_line})
df_pred_sem_const = sm.add_constant(df_pred_sem)
y_line_sem = model_sem.predict(df_pred_sem_const)

df_pred_nonsem = pd.DataFrame({"daylight_hours": X_line})
df_pred_nonsem_const = sm.add_constant(df_pred_nonsem)
y_line_nonsem = model_nonsem.predict(df_pred_nonsem_const)

# Tidy tables of the two fitted lines, convenient to print or export
df_line_sem = pd.DataFrame({
    "daylight_hours": X_line,
    "predicted_usage": y_line_sem,
    "group": "Semester",
})
df_line_nonsem = pd.DataFrame({
    "daylight_hours": X_line,
    "predicted_usage": y_line_nonsem,
    "group": "Non-Semester",
})
df_line_combined = pd.concat([df_line_sem, df_line_nonsem], ignore_index=True)

for _heading, _table, _n_rows in (
    ("\nSemester Regression Points:", df_line_sem, 10),
    ("\nNon-Semester Regression Points:", df_line_nonsem, 10),
    ("\nAll Combined:", df_line_combined, 20),
):
    print(_heading)
    print(_table.head(_n_rows))

In [None]:
sns.set_theme(style="darkgrid", context="talk")

# Scatter plus one fitted line per semester group, usage against temperature
lmplot_options = dict(
    data=df_final,
    x="mean_temp",
    y="Daily Total",
    hue="semester_label",
    scatter_kws={"alpha": 0.4, "s": 20},
    line_kws={"linewidth": 2},
    legend=False,  # the legend is added and styled by hand below
    height=6,
    aspect=1.2,
)
g = sns.lmplot(**lmplot_options)

# Add the legend, then fetch the legend object to restyle it
g.add_legend(title="Semester Group")
legend = g._legend

# Anchor the legend at (1, 1) and (re)set its title
legend.set_bbox_to_anchor((1, 1))
legend.set_title("Semester Group")

# Heavy black outline around the legend box
frame = legend.get_frame()
frame.set_edgecolor("black")
frame.set_linewidth(7.0)

# For extra padding inside the frame, one could use:
# frame.set_boxstyle("Square", pad=0.5)

g.set_axis_labels("Mean Temperature (°C)", "Daily Total Electricity Usage (kWh)")

plt.tight_layout()
plt.show()


In [None]:
sns.set_theme(style="darkgrid", context="talk")

# Scatter plus one fitted line per semester group, usage against daylight hours
lmplot_options = dict(
    data=df_final,
    x="daylight_hours",
    y="Daily Total",
    hue="semester_label",
    scatter_kws={"alpha": 0.4, "s": 20},
    line_kws={"linewidth": 2},
    legend=False,  # the legend is added and styled by hand below
    height=6,
    aspect=1.2,
)
g = sns.lmplot(**lmplot_options)

# Add the legend, then fetch the legend object to restyle it
g.add_legend(title="Semester Group")
legend = g._legend

# Anchor the legend at (1, 1) and (re)set its title
legend.set_bbox_to_anchor((1, 1))
legend.set_title("Semester Group")

# Heavy black outline around the legend box
frame = legend.get_frame()
frame.set_edgecolor("black")
frame.set_linewidth(7.0)

# For extra padding inside the frame, one could use:
# frame.set_boxstyle("Square", pad=0.5)

g.set_axis_labels("Daylight Hours", "Daily Total Electricity Usage (kWh)")

plt.tight_layout()
plt.show()