# Task-6 Complete Solution: Time Series (ARIMA) + Logistic Regression (Heart Disease)

This notebook will:
1) Create the synthetic sales dataset, and create (or load, if the CSV is already present) the heart-disease dataset.
2) Perform full analysis for both projects.
3) Generate all deliverables (plots, tables, saved models/files).

In [None]:
import os
import io
import math
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error

from statsmodels.tsa.arima.model import ARIMA

# Helper: Safe MAPE
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, guarded against zero actuals.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are converted to float arrays.

    Returns
    -------
    float
        Mean of |y_true - y_pred| / max(|y_true|, 1e-8), multiplied by 100.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Floor the denominator so a zero actual cannot cause a division by zero.
    denominator = np.maximum(np.abs(actual), 1e-8)
    abs_pct_error = np.abs((actual - predicted) / denominator)
    return np.mean(abs_pct_error) * 100.0

# Part A: Time Series Analysis with ARIMA

In [None]:
# 1) Create synthetic daily sales dataset (3 years)
# Signal = base level + linear trend + weekly cycle + annual cycle + Gaussian noise.
np.random.seed(42)  # seed the global NumPy RNG so the noise draw is reproducible
start_date = datetime(2022, 1, 1)
n_days = 3 * 365
dates = pd.date_range(start_date, periods=n_days, freq='D')

t = np.arange(n_days)
two_pi_t = 2 * np.pi * t
trend = 0.05 * t
weekly = 10 * np.sin(two_pi_t / 7)      # 7-day period
annual = 20 * np.sin(two_pi_t / 365)    # 365-day period
noise = np.random.normal(loc=0, scale=5, size=n_days)

# Floor at zero and round to whole units.
sales = np.clip(200 + trend + weekly + annual + noise, 0, None).round(0)

ts_df = pd.DataFrame({'Date': dates, 'Sales': sales})
ts_df.to_csv('sales_timeseries.csv', index=False)

print("Time series dataset created with", len(ts_df), "records")

In [None]:
# 2) Visualization (trend + moving average)
# 30-day rolling mean; min_periods=1 lets the first rows average over
# however many observations are available instead of producing NaN.
ts_df['MA_30'] = ts_df['Sales'].rolling(window=30, min_periods=1).mean()

fig, ax = plt.subplots(figsize=(10, 5))
for column, legend_label, width in (('Sales', 'Sales', 1), ('MA_30', '30-day MA', 2)):
    ax.plot(ts_df['Date'], ts_df[column], label=legend_label, linewidth=width)
ax.set_title('Sales Trend with 30-Day Moving Average')
ax.legend()
fig.tight_layout()
fig.savefig('sales_trend.png')
plt.show()
plt.close(fig)

In [None]:
# 3) Train/Test split: last 90 days as test
# Chronological hold-out (no shuffling): the final 90 rows are the test window.
holdout_days = 90
train_df, test_df = ts_df.iloc[:-holdout_days].copy(), ts_df.iloc[-holdout_days:].copy()

# Training target for ARIMA, cast to float.
y_train = train_df['Sales'].astype(float)

for label, frame in (("Training data:", train_df), ("Test data:", test_df)):
    print(label, len(frame), "records")

In [None]:
# Find the best ARIMA order via a small AIC grid search
# (p and q in 0..2, d in 0..1 -> 18 candidate orders).
# NOTE(review): AIC is being compared across models with different d here;
# that comparison is commonly advised against -- confirm the selected d is
# sensible (e.g. with a stationarity check) before relying on it.
candidate_orders = [(p, d, q) for p in range(3) for d in range(2) for q in range(3)]

best_order = None
best_aic = np.inf
for order in candidate_orders:
    p, d, q = order
    try:
        res = ARIMA(y_train, order=order).fit()
    except Exception as e:
        # Best-effort search: report the failed candidate and keep going.
        print(f'Error fitting ARIMA({p},{d},{q}): {e}')
        continue

    print(f"ARIMA({p},{d},{q}): AIC = {res.aic:.2f}")
    # Ignore non-finite AIC values so a degenerate fit can never be selected.
    if np.isfinite(res.aic) and res.aic < best_aic:
        best_aic = res.aic
        best_order = order

# Fail here, with a clear message, rather than letting the next cell call
# ARIMA(order=None) when no candidate produced a usable fit.
if best_order is None:
    raise RuntimeError(
        "ARIMA grid search failed: no candidate order could be fitted. "
        "Check y_train for length, NaNs and dtype."
    )

print(f"\nBest ARIMA order: {best_order} with AIC: {best_aic:.2f}")

In [None]:
# Fit ARIMA with best order
res = ARIMA(y_train, order=best_order).fit()

# Forecast one step per held-out row, with an interval at alpha=0.05.
forecast_res = res.get_forecast(steps=len(test_df))
forecast_mean = forecast_res.predicted_mean
forecast_ci = forecast_res.conf_int(alpha=0.05)

# Evaluate the forecast against the held-out sales.
actual_sales = test_df['Sales']
rmse = math.sqrt(mean_squared_error(actual_sales, forecast_mean))
mape_val = mape(actual_sales, forecast_mean)

print(f"RMSE: {rmse:.2f}\nMAPE: {mape_val:.2f}%")

In [None]:
# Save forecast results
# Columns 0 and 1 of the interval frame are exported as the lower and upper bound.
rounded_columns = {
    'Forecast_Sales': forecast_mean,
    'Lower_95_CI': forecast_ci.iloc[:, 0],
    'Upper_95_CI': forecast_ci.iloc[:, 1],
}

# Plain arrays are used throughout so the columns line up by position, not index.
table_columns = {
    'Date': test_df['Date'].to_numpy(),
    'Actual_Sales': test_df['Sales'].to_numpy(),
}
for column_name, series in rounded_columns.items():
    table_columns[column_name] = np.round(series.to_numpy(), 2)

forecast_table = pd.DataFrame(table_columns)
forecast_table.to_csv('forecasted_sales.csv', index=False)
print("Forecast results saved to forecasted_sales.csv")

In [None]:
# Forecast plot
# Training history, held-out actuals, the forecast and its shaded interval band.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_df['Date'], train_df['Sales'], label='Train', linewidth=1)

test_dates = test_df['Date']
ax.plot(test_dates, test_df['Sales'], label='Actual', linewidth=1)
ax.plot(test_dates, forecast_mean, label=f'Forecast ARIMA{best_order}', linewidth=2)
ax.fill_between(test_dates, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], alpha=0.2)

ax.set_title(f'ARIMA Forecast vs Actual | RMSE={rmse:.2f}, MAPE={mape_val:.2f}%')
ax.legend()
fig.tight_layout()
fig.savefig('sales_forecast.png')
plt.show()
plt.close(fig)

# Part B: Logistic Regression — Heart Disease

In [None]:
# Load the heart-disease dataset if the CSV already exists; otherwise
# synthesise one, save it, and use it.
csv_path = 'heart_disease.csv'
if not os.path.exists(csv_path):
    # Create synthetic dataset
    np.random.seed(7)
    N = 1000
    age = np.random.randint(29, 78, size=N)
    gender = np.random.choice(['Male', 'Female'], size=N, p=[0.6, 0.4])
    cholesterol = np.random.normal(220, 35, size=N).clip(120, 380).astype(int)
    systolic = np.random.normal(135, 18, size=N).clip(90, 220).astype(int)
    diastolic = np.random.normal(85, 12, size=N).clip(50, 140).astype(int)

    # Linear risk score from the features plus unit-variance noise.
    gender_flag = (gender == 'Male').astype(int)
    risk = (
        0.04*(age-50) + 0.02*(cholesterol-200) + 0.03*(systolic-120) + 0.02*(diastolic-80)
        + 0.2*gender_flag + np.random.normal(0,1.0,size=N)
    )
    # Logistic link on the centred risk score. The earlier slope of 0.02 kept
    # 0.02*(risk - mean) close to 0 for every row (risk only varies by a few
    # units), so every probability was ~0.5 and the label was nearly
    # independent of the features. A unit slope lets the features drive the label.
    prob = 1/(1+np.exp(-(risk-np.mean(risk))))
    heart_disease = (np.random.rand(N) < prob).astype(int)

    heart_df = pd.DataFrame({
        'Age': age,
        'Gender': gender,
        'Cholesterol': cholesterol,
        # Stored as a "systolic/diastolic" string; parsed again during preprocessing.
        'Blood Pressure': [f'{s}/{d}' for s,d in zip(systolic, diastolic)],
        'Heart Disease': heart_disease
    })
    heart_df.to_csv(csv_path, index=False)
    print("Heart disease dataset created with", len(heart_df), "records")
else:
    heart_df = pd.read_csv(csv_path)
    print("Loaded existing heart disease dataset with", len(heart_df), "records")

In [None]:
# Data preprocessing
# Always start from heart_df so re-running this cell gives the same result.
df = heart_df.drop_duplicates()
# Fill gaps from the neighbouring row (forward first, then backward for leading gaps).
# NOTE(review): this also fills a missing 'Heart Disease' label -- confirm that is intended.
df = df.ffill().bfill()

# Split "systolic/diastolic" once, vectorised, instead of parsing every string twice.
bp_parts = df['Blood Pressure'].astype(str).str.split('/', expand=True)
df['Systolic'] = bp_parts[0].astype(int)
df['Diastolic'] = bp_parts[1].astype(int)

# Encode Gender as 1/0. Values are normalised (trimmed, lower-cased) and then
# validated: an unrecognised value used to become NaN silently and only
# surfaced later as an obscure error when fitting the model.
gender_codes = {'male': 1, 'female': 0}
gender_normalised = df['Gender'].astype(str).str.strip().str.lower()
unknown_genders = sorted(set(gender_normalised) - set(gender_codes))
if unknown_genders:
    raise ValueError(
        f"Unrecognised Gender value(s) {unknown_genders}; expected 'Male' or 'Female'."
    )
df['Gender'] = gender_normalised.map(gender_codes)

X = df[['Age','Gender','Cholesterol','Systolic','Diastolic']].astype(float)
y = df['Heart Disease'].astype(int)

print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
# Train logistic regression model
# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# statistics (mean/std) leak into training and bias the reported metrics.
# The training labels get a Part-B-specific name so this cell does not rebind
# `y_train`, which the ARIMA cells use for the sales series.
X_train_raw, X_test_raw, y_train_heart, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train_heart)
y_pred = lr.predict(X_test)

# zero_division=0 reports 0 (rather than warning) when a metric's denominator is empty.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")

In [None]:
# Save evaluation report
# The header contains a non-ASCII em dash, so the encoding is set explicitly:
# without it open() uses the platform default, which varies between machines.
report_sections = [
    'Logistic Regression — Heart Disease Evaluation\n',
    f'Accuracy:  {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1-score: {f1:.4f}\n',
    '\nClassification Report:\n',
    classification_report(y_test,y_pred,zero_division=0),
]
with open('heart_eval_report.txt', 'w', encoding='utf-8') as f:
    f.writelines(report_sections)

print("Evaluation report saved to heart_eval_report.txt")

In [None]:
# Confusion matrix plot
# Heat map of cm with each cell's count written at its centre.
fig, ax = plt.subplots(figsize=(5, 4))
image = ax.imshow(cm, cmap='Blues')
ax.set_title('Confusion Matrix (Heart Disease)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.colorbar(image, ax=ax)

n_rows, n_cols = cm.shape
for row in range(n_rows):
    for col in range(n_cols):
        # x is the column (predicted), y is the row (actual).
        ax.text(col, row, str(cm[row, col]), ha='center', va='center', fontweight='bold')

fig.tight_layout()
fig.savefig('heart_confusion_matrix.png')
plt.show()
plt.close(fig)

In [None]:
# Save model and scaler
# Both objects are pickled so the scaler can be reapplied before the model at inference time.
artifacts = (
    ('heart_lr_model.pkl', lr),
    ('heart_scaler.pkl', scaler),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and scaler saved successfully")
print('✅ All deliverables generated successfully in the working folder!')