In [1]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
# NOTE(review): __version__ is not referenced in any cell visible in this excerpt —
# confirm whether it is still needed.
from plotly import __version__
import plotly.offline as py
# Switch plotly's offline mode on for notebook rendering. Per the plotly docs,
# connected=True loads plotly.js from a CDN instead of embedding it in the notebook,
# so figures presumably need network access to render — TODO confirm for offline use.
py.init_notebook_mode(connected=True)
# NOTE(review): `go` is not used in the cells visible here (they use plotly.express).
import plotly.graph_objs as go

In [2]:
# Load both unemployment series the same way: one CSV per series, with the
# "observation_date" column parsed to datetimes so the frames can be joined on it.
unrate, caur = (
    pd.read_csv(csv_name, parse_dates=["observation_date"])
    for csv_name in (
        "Unemployment_Rate.csv",
        "Unemployment_Rate_in_California.csv",
    )
)

In [3]:
# Join the two series on their shared date column (pandas' default inner join).
# The suffixes only apply to non-key column names present in both frames; the
# preview printed below shows UNRATE and CAUR unsuffixed.
data = unrate.merge(
    caur,
    on="observation_date",
    suffixes=("_nation", "_california"),
)

In [4]:
# Quick sanity check of the merged frame: show its first five rows.
print(data.head(n=5))

  observation_date  UNRATE  CAUR
0       1976-01-01     7.9   9.2
1       1976-02-01     7.7   9.2
2       1976-03-01     7.6   9.1
3       1976-04-01     7.7   9.1
4       1976-05-01     7.4   9.0


In [5]:
# Data-quality check: number of missing values in each column ...
missing_per_column = data.isna().sum()
print(missing_per_column)

# ... followed by descriptive statistics for every column.
summary_stats = data.describe()
print(summary_stats)

observation_date    0
UNRATE              0
CAUR                0
dtype: int64
                    observation_date      UNRATE        CAUR
count                            587  587.000000  587.000000
mean   2000-05-31 20:51:06.439522944    6.113799    7.144804
min              1976-01-01 00:00:00    3.400000    3.800000
25%              1988-03-16 12:00:00    4.700000    5.300000
50%              2000-06-01 00:00:00    5.800000    6.800000
75%              2012-08-16 12:00:00    7.200000    8.400000
max              2024-11-01 00:00:00   14.800000   16.100000
std                              NaN    1.757305    2.195493


In [6]:
import plotly.express as px

# Human-readable axis titles for the two rate columns.
axis_labels = {
    "UNRATE": "National Unemployment Rate (%)",
    "CAUR": "California Unemployment Rate (%)",
}

# One point per observation date: national rate on x, California rate on y.
# Partial opacity keeps overlapping points distinguishable.
fig = px.scatter(
    data,
    x="UNRATE",
    y="CAUR",
    title="National vs California Unemployment Rate",
    labels=axis_labels,
    opacity=0.7,
)
fig.show()

In [7]:
from sklearn.linear_model import LinearRegression

# Fit a simple linear regression of the California rate (CAUR) on the national
# rate (UNRATE). scikit-learn expects a 2-D feature matrix, hence reshape(-1, 1)
# on the single predictor column.
X = data["UNRATE"].to_numpy().reshape(-1, 1)
y = data["CAUR"].to_numpy()

# NOTE: the name `model` is rebound by the statsmodels cell that follows.
model = LinearRegression()
model.fit(X, y)

# Keep the fitted values next to the observations; re-running this cell simply
# overwrites the column, so the cell stays idempotent.
data['Prediction'] = model.predict(X)

fig = px.scatter(data, x="UNRATE", y="CAUR",
                 title="National vs California Unemployment Rate with Regression Line",
                 labels={"UNRATE": "National Unemployment Rate (%)",
                         "CAUR": "California Unemployment Rate (%)"})

# A 'lines' trace connects points in the order given. The rows of `data` are not
# ordered by UNRATE, so drawing them as-is makes the path double back along the
# fitted line hundreds of times. Sort by the x variable so the regression line is
# drawn as one left-to-right path.
line_points = data.sort_values("UNRATE")
fig.add_scatter(x=line_points["UNRATE"], y=line_points["Prediction"],
                mode='lines', name='Regression Line')
fig.show()

In [8]:
import statsmodels.api as sm

# Re-estimate the same CAUR ~ UNRATE relationship with statsmodels to obtain the
# full inference table. add_constant supplies the intercept column, which the
# summary reports as `const`.
X_with_const = sm.add_constant(data["UNRATE"])

# Specify the model first, then fit it; `model` ends up holding the fitted results.
ols_spec = sm.OLS(endog=data["CAUR"], exog=X_with_const)
model = ols_spec.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   CAUR   R-squared:                       0.841
Model:                            OLS   Adj. R-squared:                  0.841
Method:                 Least Squares   F-statistic:                     3092.
Date:                Mon, 20 Jan 2025   Prob (F-statistic):          1.09e-235
Time:                        18:58:31   Log-Likelihood:                -754.50
No. Observations:                 587   AIC:                             1513.
Df Residuals:                     585   BIC:                             1522.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1404      0.131      1.071      0.2