# Machine Learning

### 1. Can you project the sales amount of each nation over the coming months/years?

## Import Libraries

In [None]:
# Import Library
import pandas as pd
import os

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Turn off pandas' chained-assignment warning and show floats with two decimals.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)

## Prepare Data

In [None]:
# Read the cleaned CSV tables; every file is named after the variable it is bound to.
data_dir = "clean_data"
OCNR, SNR, L, P, PS = (
    pd.read_csv(f"{data_dir}/{_table}.csv")
    for _table in ("OCNR", "SNR", "L", "P", "PS")
)

In [None]:
# Inner-join L with OCNR on their order-key columns.
LOCNR = L.merge(
    OCNR,
    how='inner',
    left_on='L_ORDERKEY',
    right_on='O_ORDERKEY',
)

In [None]:
# Collapse LOCNR to one row per nation / region / order year / order month.
# NOTE(review): if LOCNR holds several rows per order (one per L row joined on
# the order key), O_TOTALPRICE presumably repeats on each of them and the sum
# below would count it once per row -- confirm against the data.
AGG_df = (
    LOCNR
    .groupby(['C_NATION', 'C_REGION', 'O_ORDERYEAR', 'O_ORDERMONTH'], as_index=False)
    .agg(
        L_QUANTITY=('L_QUANTITY', 'sum'),
        L_DISCOUNT=('L_DISCOUNT', 'mean'),
        L_EXTENDEDPRICE=('L_EXTENDEDPRICE', 'mean'),
        L_TAX=('L_TAX', 'mean'),
        LEADDAY=('LEADDAY', 'mean'),
        O_TOTALPRICE=('O_TOTALPRICE', 'sum'),
    )
)

In [None]:
# Preview the first five aggregated rows.
AGG_df.head(n=5)

In [None]:
# Timestamp for the first day of each (order year, order month) pair.
AGG_df['MONTHYEAR'] = pd.to_datetime(
    AGG_df[['O_ORDERYEAR', 'O_ORDERMONTH']]
    .rename(columns={'O_ORDERYEAR': 'year', 'O_ORDERMONTH': 'month'})
    .assign(day=1)
)

## Linear Regression

### Split by Nation

In [None]:
# One sub-frame per distinct nation, in order of first appearance in AGG_df.
by_nations = [
    AGG_df[AGG_df['C_NATION'] == _nation_name]
    for _nation_name in AGG_df['C_NATION'].unique()
]

### Train Model

In [None]:
# Predict totalprice up to a specified end date
def predict_totaL_price_until_date(_df: pd.DataFrame, end_date: str, model: "Pipeline"):
    """Extend one nation's history with future months and predict every row.

    Args:
        _df: Rows for a single nation. Must contain ``C_REGION``, ``C_NATION``
            and ``MONTHYEAR`` (parsed with the ``%Y%m`` format). The region and
            nation are taken from the first row. The frame is not modified.
        end_date: Forecast horizon. A month is appended only if its last day
            falls on or before this date.
        model: Fitted estimator whose ``predict`` accepts the columns
            ``['C_REGION', 'C_NATION', 'MONTHYEAR']``.

    Returns:
        A new DataFrame holding the original rows followed by one row per
        future month, with ``MONTHYEAR`` as a ``yyyymm`` integer and a
        ``PREDICTEDTOTALPRICE`` column. Columns that exist only in ``_df``
        are NaN on the future rows.
    """
    # Work on a copy so the caller's frame is left untouched.
    history = _df.copy()
    nation = history['C_NATION'].iloc[0]
    region = history['C_REGION'].iloc[0]

    # Parse to datetime so the months can be compared and extended.
    observed_months = pd.to_datetime(history['MONTHYEAR'], format='%Y%m')

    # Start at the month AFTER the last observed one; starting at the last
    # observed month would append it a second time with no actual value.
    first_future = observed_months.max().to_period('M') + 1
    # Last month whose final day is <= end_date (the cutoff a month-end
    # date range would apply).
    last_future = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).to_period('M') - 1
    future_months = pd.period_range(start=first_future, end=last_future, freq='M').strftime('%Y%m')

    future_df = pd.DataFrame({
        'C_REGION': [f'{region}'] * len(future_months),
        'C_NATION': [f'{nation}'] * len(future_months),
        'MONTHYEAR': future_months,
    })

    # Bring both parts to the same yyyymm text form before stacking them.
    history['MONTHYEAR'] = observed_months.dt.strftime('%Y%m')
    combined = pd.concat([history, future_df])

    # The model consumes MONTHYEAR as an integer yyyymm value.
    combined['MONTHYEAR'] = combined['MONTHYEAR'].astype(int)
    combined['PREDICTEDTOTALPRICE'] = model.predict(combined[['C_REGION', 'C_NATION', 'MONTHYEAR']])

    return combined

# Plotting
def plot_actual_vs_predicted(_df: pd.DataFrame):
    """Plot actual and predicted total price against MONTHYEAR for one nation.

    Opens a new 10x6 pyplot figure, draws ``O_TOTALPRICE`` as blue scatter
    points and ``PREDICTEDTOTALPRICE`` as a red line with markers, and titles
    the figure with the nation taken from the first row of ``_df``.

    Returns:
        The ``matplotlib.pyplot`` module (not the figure object).
    """
    # Nation label from the first row, lower-cased and then upper-cased for the title.
    title_text = _df['C_NATION'].iloc[0].lower().upper()

    plt.figure(figsize=(10, 6))

    # Observed values as individual points.
    actual_style = dict(marker='o', color='b', label='Actual TOTAL PRICE')
    plt.scatter(_df['MONTHYEAR'], _df['O_TOTALPRICE'], **actual_style)

    # Model output for every row as a connected line.
    predicted_style = dict(marker='o', linestyle='-', color='r', label='PREDICTED TOTAL PRICE')
    plt.plot(_df['MONTHYEAR'], _df['PREDICTEDTOTALPRICE'], **predicted_style)

    # Axis labels, title and general cosmetics.
    plt.xlabel('MONTH YEAR')
    plt.ylabel('TOTAL PRICE')
    plt.title(title_text)
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    return plt

### Result

In [None]:
# Forecast horizon, identical for every nation
end_date = '2000-03-01'

results = []
_forecast_frames = []

# Train, evaluate and forecast one model per nation
for _i, (nation, _nation_df) in enumerate(AGG_df.groupby("C_NATION")):
    # Features and target; copy so the MONTHYEAR rewrite below does not
    # operate on a slice of AGG_df
    X = _nation_df[['C_REGION', 'C_NATION', 'MONTHYEAR']].copy()
    y = _nation_df['O_TOTALPRICE']

    # String of nation name
    _nation_str = X['C_NATION'].iloc[0].lower()

    # The regressor needs a numeric month: encode as the integer yyyymm.
    # NOTE: this axis is not evenly spaced (e.g. 199212 -> 199301 is a step
    # of 89 while months within a year are a step of 1).
    X['MONTHYEAR'] = X['MONTHYEAR'].dt.strftime('%Y%m').astype(int)

    # One-hot encode the categorical columns, dropping the first category of
    # each; MONTHYEAR is passed through unchanged
    _preprocessor = ColumnTransformer(
        transformers=[
            ('category', OneHotEncoder(drop='first'), ['C_REGION', 'C_NATION'])
        ],
        remainder='passthrough'
    )

    # Pipeline: preprocessing followed by a linear regression
    model = Pipeline(steps=[
        ('preprocessor', _preprocessor),
        ('regressor', LinearRegression())
    ])

    # Hold out 20% of the rows; random_state fixes the shuffle
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Fit on the training rows ONLY so the metrics below are computed on rows
    # the model has not seen
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out rows
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print immediate result
    print('___________________________')
    print(f'Country: {_nation_str}')
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')

    # Refit on the full history: the reported coefficients and the forecast
    # should use every available observation
    model.fit(X, y)

    # Get model coefficients
    regressor = model.named_steps['regressor']
    feature_names = _preprocessor.named_transformers_['category'].get_feature_names_out(['C_REGION', 'C_NATION']).tolist() + ['MONTHYEAR']
    coefficients = regressor.coef_

    print('Coefficients:')
    for feature, coef in zip(feature_names, coefficients):
        print(f'{feature}: {coef}')

    # Put features and target side by side again, ordered by month
    _history_df = pd.concat([X, y], axis=1).sort_values(by=['MONTHYEAR'])

    # Predict TOTAL_PRICE up to the specified end date
    _extended_df = predict_totaL_price_until_date(_history_df, end_date, model)

    # Convert back to datetime for the X axis
    _extended_df['MONTHYEAR'] = pd.to_datetime(_extended_df['MONTHYEAR'], format='%Y%m')

    # Plot
    _plot = plot_actual_vs_predicted(_extended_df)

    results.append((_i, _nation_str, _extended_df, model, _plot))
    _forecast_frames.append(_extended_df)

# Concatenate once at the end instead of growing a DataFrame inside the loop
result_df = pd.concat(_forecast_frames) if _forecast_frames else pd.DataFrame()

## Save Data

In [None]:
# Create the output folder if it does not exist; exist_ok avoids the separate
# existence check
data_dir = "output"
os.makedirs(data_dir, exist_ok=True)

# Write the modeled data into the output folder
result_df.to_csv(f"{data_dir}/Lab1-1.csv")