<a href="https://colab.research.google.com/github/RobertGrados/RobertGrados/blob/main/HW8_RG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Given the comprehensive dataset provided from Zillow, which encompasses various
# features ranging from architectural details to taxation information, students are
# tasked with applying unsupervised learning methods to uncover underlying patterns in
# the data. The assignment will involve two main techniques: Principal Component
# Analysis (PCA) and Clustering.
# NOTE: the cells below instead load "Homework8.csv" transaction data and fit
# time-series forecasts (Holt-Winters and a lagged linear regression).

In [None]:
!pip install -U dataprep

In [None]:
from dataprep.eda import plot, plot_correlation, plot_missing
from sklearn.metrics import classification_report

In [None]:
import pandas as pd

# Read the homework CSV with pandas' default index, then show its first ten rows
# as the cell output.
df = pd.read_csv(filepath_or_buffer="Homework8.csv")
df.head(n=10)

In [None]:
from dataprep.eda import plot, plot_correlation, plot_missing

# Hand the `merchant` column to dataprep's `plot`; its return value is the cell output.
merchant_column = df['merchant']
plot(merchant_column)

In [None]:
# Re-read the CSV, replacing the earlier `df`: this time the `time` column becomes
# the index and date parsing is enabled for it.
df = pd.read_csv(
    "Homework8.csv",
    parse_dates=True,
    index_col='time',
)

In [None]:
# Keep only rows whose index satisfies 2033-01-01 <= index < 2035-01-01
# (the upper bound is exclusive).
on_or_after_start = df.index >= "2033-01-01"
before_end = df.index < "2035-01-01"
df = df[on_or_after_start & before_end]

In [None]:
# Total `amount_usd_in_cents` within each calendar-day ("D") bin.
daily_transactions = df["amount_usd_in_cents"].resample("D").sum()

# Persist the daily totals; the linear-regression cell further down reads this
# file back with pd.read_csv.
daily_transactions.to_csv("daily_transactions.csv")

In [None]:
# Text preview of the first five daily totals.
print(daily_transactions.head(n=5))

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import matplotlib.pyplot as plt

data = daily_transactions

# Number of days to forecast past the last observation; change it here only.
forecast_horizon = 30

# Holt-Winters with an additive trend and multiplicative seasonality over a
# 7-day (weekly) period.
# NOTE(review): multiplicative seasonality presumably needs strictly positive
# values, and a day with no rows sums to 0 after resampling -- confirm the series
# has no zero days, or switch to seasonal='add'.
model = ExponentialSmoothing(data, seasonal='mul', seasonal_periods=7, trend="additive")
model_fit = model.fit()

# Out-of-sample forecast for the next `forecast_horizon` days.
forecast = model_fit.forecast(steps=forecast_horizon)

print(forecast)
forecast.plot(label="Forecast")
data.plot(label="Actual")
plt.legend()
plt.show()

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import matplotlib.pyplot as plt

data = daily_transactions

# Holt-Winters with additive 7-day seasonality and no trend component
# (unlike the previous cell, which used multiplicative seasonality plus a trend).
model = ExponentialSmoothing(data, seasonal='add', seasonal_periods=7)
model_fit = model.fit()
forecast = model_fit.forecast(steps=30)

# Plot the history as a solid line and the 30-day forecast as a dashed line.
plt.figure(figsize=(12, 6))
plt.plot(data, label='Historical Data', linestyle='-')
plt.plot(forecast, label='Forecast', linestyle='--')
plt.title('Daily Transaction Data and Holt-Winters Forecast')
plt.xlabel('Date')
# The series is a sum of `amount_usd_in_cents`, so the axis unit is cents.
plt.ylabel('Transaction Sum (USD cents)')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.holtwinters import ExponentialSmoothing


daily_data = daily_transactions

# NOTE(review): `window_size` is not read by anything in this cell; kept in case
# another cell relies on it.
window_size = 30  # Adjust window size as needed
folds = 5

# Create time series split object
cv = TimeSeriesSplit(n_splits=folds)

# Fit on each training fold, forecast the length of the matching test fold, and
# record that fold's mean squared error.
mse_scores = []
for train_idx, test_idx in cv.split(daily_data):
    # The split yields integer positions, while the series is indexed by date, so
    # select rows with .iloc instead of relying on positional fallback in `[]`.
    train_series = daily_data.iloc[train_idx]
    test_series = daily_data.iloc[test_idx]

    model = ExponentialSmoothing(train_series, seasonal='add', seasonal_periods=7)
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=len(test_idx))

    mse = ((test_series - forecast) ** 2).mean()  # Mean Squared Error
    mse_scores.append(mse)

# Print average MSE across folds
average_mse = sum(mse_scores) / len(mse_scores)
print(f"Average Mean Squared Error (MSE) for Holt-Winters: {average_mse}")

In [None]:
# Define number of lags to consider
lags = 7  # Adjust as needed

# One column for the series itself plus one per lag (lag_1 .. lag_<lags>).
# Every column is derived from `daily_transactions`, so this cell does not depend
# on a name that is only defined in the cross-validation cell.
df_features = pd.DataFrame({
    'original_data': daily_transactions,
    **{f'lag_{i}': daily_transactions.shift(i) for i in range(1, lags + 1)},
})

# Shifting leaves NaN in the leading rows of each lag column; drop those rows.
df_features = df_features.dropna()

# Separate features (the lag columns) from the target (the unshifted series).
X = df_features.drop('original_data', axis=1)
y = df_features['original_data']

# Select features to plot (adjust the number based on your 'lags')
features_to_plot = ['original_data', 'lag_1', 'lag_2', 'lag_3']

# Plotting
plt.figure(figsize=(12, 6))
for feature in features_to_plot:
    plt.plot(df_features[feature], label=feature)

plt.title('Original Transaction Data and Lagged Features')
plt.xlabel('Time Index')
# The series is a sum of `amount_usd_in_cents`, so the axis unit is cents.
plt.ylabel('Transaction Sum (USD cents)')
plt.legend()
plt.show()

In [None]:
# Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Reload the saved daily totals and attach 1-, 7- and 30-day lag columns.
# NOTE(review): `data` is not used by the regression below, which trains on
# `df_features`; confirm which feature set was intended.
data = pd.read_csv('daily_transactions.csv')

# Define how many lags to create
lag_periods = [1, 7, 30]

# Create lagged features
for lag in lag_periods:
    data[f'transction_lag_{lag}'] = data["amount_usd_in_cents"].shift(lag)

# Drop the NaN rows once, after every lag column exists. Dropping inside the loop
# removes rows before the next shift and so discards more rows than the largest
# lag requires.
data = data.dropna()

# Separate features and target variable
X = df_features.drop('original_data', axis=1)
y = df_features['original_data']

# Chronological split: the first 80% of rows train the model and the last 20% test
# it. shuffle=False keeps later days out of the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Create and fit linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate performance using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Linear Regression Mean Squared Error (MSE): {mse}')

# Plot both curves against the test rows' own index so actual and predicted values
# share one x-axis (y_pred is a bare array with no index of its own).
plt.figure(figsize=(12, 6))
plt.plot(y_test.index, y_test.to_numpy(), label='Actual')
plt.plot(y_test.index, y_pred, label='Predicted')
plt.title('Linear Regression: Actual vs. Predicted Transactions')
plt.xlabel('Time Index')
# The target is a sum of `amount_usd_in_cents`, so the axis unit is cents.
plt.ylabel('Transaction Sum (USD cents)')
plt.legend()
plt.show()

In [None]:
# Forcasting Methods
!python -m pip install statsmodels
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import TimeSeriesSplit

cv =TimeSeriesSplit(n_splits=3)
mae_scores = []

def holt_winters_forecast(steps=len(test_data))
return