In [3]:
# Add the py_package directory to sys.path (the interpreter's module search path)

from sys import path

from os.path import expanduser as expanduser
from os.path import join as join

# Absolute path of the local package directory, rooted at the user's home.
package_dir_parts = ("Documents", "CallCenterStaffing", "DataScripts", "py_package")
py_path = join(expanduser("~"), *package_dir_parts)

# Register the directory only once so re-running the cell adds no duplicates.
# NOTE(review): the later `from py_package import ...` resolves through the
# directory that *contains* py_package -- confirm this is the intended entry.
if not (py_path in path):
    path.append(py_path)

In [12]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from importlib import reload
from sklearn import datasets, linear_model
from py_package import db_conn as db

# Connection settings handed to DbConn.
# NOTE(review): credentials are hard-coded in the notebook; consider reading
# them from the environment instead.
conn_dict = dict(
    user="szymonbocian",
    password="",
    host="localhost",
    port="5432",
    database="dwh_call_center",
)

# Year and week numbers that select the two data sets used for the regression.
lin_regr_sets_dict = dict(
    sets_year=2019,
    train_set_week=11,
    test_set_week=12,
)

# Instantiate DbConn with the connection settings above.
# NOTE: this rebinds `db` from the imported `db_conn` module to the DbConn
# instance, so the module is no longer reachable under this name afterwards.
db = db.DbConn(conn_dict)

def _week_query(week_key):
    """Return the SELECT that pulls one ISO week of rows from the snapshot view.

    `week_key` names the entry of `lin_regr_sets_dict` holding the week number;
    the year always comes from its "sets_year" entry.
    """
    return (
        "SELECT sdh.* FROM snap.service_date_hour_vw sdh "
        "INNER JOIN tech.date_dim dd ON sdh.date_key = dd.date_key "
        "WHERE dd.iso_week_of_year = '%s' AND dd.year_value = %d;"
        % (lin_regr_sets_dict[week_key], lin_regr_sets_dict["sets_year"])
    )


# Both data sets come from the same view and differ only in the week number.
# NOTE(review): these queries carry no ORDER BY, so row order is whatever the
# view returns; the later positional pairing of the two frames depends on it.
train_X_sql_query = _week_query("train_set_week")
test_Y_sql_query = _week_query("test_set_week")

sql_dict = {
    "train_X": train_X_sql_query,
    # The column names are applied positionally to the rows of `SELECT sdh.*`,
    # so they must come back in the view's column order; without an explicit
    # ORDER BY that order is not guaranteed.
    "sets_columns": (
        "SELECT column_name FROM INFORMATION_SCHEMA.columns "
        "WHERE table_schema = 'snap' AND table_name = 'service_date_hour_vw' "
        "ORDER BY ordinal_position;"
    ),
    "test_Y": test_Y_sql_query,
}

# Column labels: first field of every row returned by the metadata query.
column_rows = db.run_sql(sql_dict["sets_columns"])
col = [row[0] for row in column_rows]

# Load each week's rows into a DataFrame labelled with those columns.
train_rows = db.run_sql(sql_dict["train_X"])
train_X_df = pd.DataFrame(train_rows, columns=col)

test_rows = db.run_sql(sql_dict["test_Y"])
test_Y_df = pd.DataFrame(test_rows, columns=col)

# Rows with hour_key below this hour are labelled shift "1", all others "2".
_SHIFT_BOUNDARY_HOUR = 16

# Vectorised comparison on the whole column instead of a row-by-row
# DataFrame.apply with a Python lambda; the same rule is applied to both frames.
for _frame in (train_X_df, test_Y_df):
    _frame["shift"] = np.where(_frame["hour_key"] < _SHIFT_BOUNDARY_HOUR, "1", "2")

# First row of each frame. Only the last bare expression of a cell is
# rendered, so the train preview is evaluated but not displayed.
train_X_df.head(1)
test_Y_df.head(1)

Unnamed: 0,service_name,service_key,date_value,date_key,hour_key,start_hour,end_hour,period_hour,avg_handle_time_sec,avg_handle_time_min,total_call,shift
0,Billing,2,2019-03-18,20190318,6,06:00,06:59,06:00 - 06:59,356,6,4,1


In [22]:
# Trim both call-count series to the length of the shorter one so they can be
# paired element by element.
# NOTE(review): the pairing is purely positional; nothing here matches rows on
# service, date or hour -- confirm the query row order makes this meaningful.
l_size = min(train_X_df["total_call"].size, test_Y_df["total_call"].size)

total_call_num_X = train_X_df["total_call"].iloc[:l_size]
total_call_num_Y = test_Y_df["total_call"].iloc[:l_size]

# Sanity check: all three numbers should be equal.
print(l_size, total_call_num_X.size, total_call_num_Y.size)

907 907 907


In [23]:
# Linear regression of one week's call counts on the other week's.

# Reshape each series into a single-column 2-D array, the layout fit() takes.
# Despite its name, total_call_num_Y_test is used as the fit target below.
total_call_num_X_train = total_call_num_X.values.reshape(-1, 1)
total_call_num_Y_test = total_call_num_Y.values.reshape(-1, 1)

regr = linear_model.LinearRegression()
regr.fit(total_call_num_X_train, total_call_num_Y_test)

# NOTE: every metric below is computed on the same arrays the model was fitted
# on, so these are in-sample figures, not a held-out evaluation.

# The coefficients
print('Coefficients: \n', regr.coef_)
# np.mean of the squared residuals is the mean squared error, not a sum of
# squares, so the label says so.
print("Mean squared error: %.2f"
      % np.mean((regr.predict(total_call_num_X_train) - total_call_num_Y_test) ** 2))
# LinearRegression.score returns the coefficient of determination (R^2):
# 1 is perfect prediction.
print('Variance score: %.2f' % regr.score(total_call_num_X_train, total_call_num_Y_test))


Coefficients: 
 [[0.77109915]]
Residual sum of squares: 729.60
Variance score: 0.54
