In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Classification models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model evaluation
from sklearn import metrics

## Load the Data

In [2]:
bow = pd.read_csv('bag_of_words.csv')
sales = pd.read_csv('sales.csv')

x = bow.iloc[:, 1:] # Exclude "quarter_statement"

The procedures are similar for all following models:
1. Set the dependent variable
2. Randomly split dataset with 80% (28 observations) in training set
3. Fit a model with training data
4. Use the fitted model to predict test data
5. Evaluate the differences between predictions and actual test values

# Regression Models

## Linear Regression

In [3]:
y = sales['growth']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

lin = LinearRegression().fit(x_train, y_train)

lin_pred = lin.predict(x_test)

# Mean squared error
metrics.mean_squared_error(y_test, lin_pred)

0.005586636342396019

## Random Forest Regressor

In [4]:
y = sales['growth']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

rfr = RandomForestRegressor(
    n_estimators=100, max_depth=8, min_samples_leaf=1, random_state=10)\
    .fit(x_train, y_train)

rfr_pred = rfr.predict(x_test)

metrics.mean_squared_error(y_test, rfr_pred)

0.001619217516145528

## XGBoost Regressor

In [5]:
y = sales['growth']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

xgbr = XGBRegressor().fit(x_train, y_train)

xgbr_pred = xgbr.predict(x_test)

metrics.mean_squared_error(y_test, xgbr_pred)

0.004379813386805549

# Classification Models

## Naive Bayes

In [6]:
# y is 1 if "growth" is greater than 0, 0 otherwise
y = (sales['growth'] > 0).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

nb = MultinomialNB().fit(x_train, y_train)

nb_pred = nb.predict(x_test)

# Predictive accuracy and confusion matrix
print(metrics.accuracy_score(y_test, nb_pred))
metrics.confusion_matrix(y_test, nb_pred, labels=[0, 1])

0.5


array([[0, 4],
       [0, 4]], dtype=int64)

In [7]:
# y is 1 if "growth" is greater than its median, 0 otherwise
y = (sales['growth'] > sales['growth'].median()).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

nb = MultinomialNB().fit(x_train, y_train)

nb_pred = nb.predict(x_test)
print(metrics.accuracy_score(y_test, nb_pred))
metrics.confusion_matrix(y_test, nb_pred, labels=[0, 1])

0.75


array([[0, 2],
       [0, 6]], dtype=int64)

In [8]:
# y has 3 classes, each class has 12 observations
def class_3(growth):
    if growth < -0.062:
        return 0
    elif growth < -0.007:
        return 1
    else:
        return 2

y = sales['growth'].map(class_3)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

nb = MultinomialNB().fit(x_train, y_train)

nb_pred = nb.predict(x_test)
metrics.accuracy_score(y_test, nb_pred)

0.5

In [9]:
# y has 4 classes, each class has 9 observations
def class_4(growth):
    if growth < -0.075:
        return 0
    elif growth < -0.03:
        return 1
    elif growth < 0.1:
        return 2
    else:
        return 3
   
y = sales['growth'].map(class_4)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

nb = MultinomialNB().fit(x_train, y_train)

nb_pred = nb.predict(x_test)
metrics.accuracy_score(y_test, nb_pred)

0.375

## Logistic Regression (Softmax for Multi-class)

In [10]:
# y is 1 if "growth" is greater than 0, 0 otherwise
y = (sales['growth'] > 0).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

log = LogisticRegression().fit(x_train, y_train)

log_pred = log.predict(x_test)
print(metrics.accuracy_score(y_test, log_pred))
metrics.confusion_matrix(y_test, log_pred, labels=[0, 1])

0.875


array([[3, 1],
       [0, 4]], dtype=int64)

In [11]:
# y is 1 if "growth" is greater than its median, 0 otherwise
y = (sales['growth'] > sales['growth'].median()).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

log = LogisticRegression().fit(x_train, y_train)

log_pred = log.predict(x_test)
print(metrics.accuracy_score(y_test, log_pred))
metrics.confusion_matrix(y_test, log_pred, labels=[0, 1])

0.75


array([[1, 1],
       [1, 5]], dtype=int64)

In [12]:
# y has 3 classes, each class has 12 observations
def class_3(growth):
    if growth < -0.062:
        return 0
    elif growth < -0.007:
        return 1
    else:
        return 2

y = sales['growth'].map(class_3)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

log = LogisticRegression().fit(x_train, y_train)

log_pred = log.predict(x_test)
metrics.accuracy_score(y_test, log_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.75

In [13]:
# y has 4 classes, each class has 9 observations
def class_4(growth):
    if growth < -0.075:
        return 0
    elif growth < -0.03:
        return 1
    elif growth < 0.1:
        return 2
    else:
        return 3
   
y = sales['growth'].map(class_4)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

log = LogisticRegression().fit(x_train, y_train)

log_pred = log.predict(x_test)
metrics.accuracy_score(y_test, log_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.625

## Random Forest Classifier

In [14]:
# y is 1 if "growth" is greater than 0, 0 otherwise
y = (sales['growth'] > 0).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, min_samples_leaf=2, random_state=10)\
    .fit(x_train, y_train)

rfc_pred = rfc.predict(x_test)
print(metrics.accuracy_score(y_test, rfc_pred))
metrics.confusion_matrix(y_test,rfc_pred, labels=[0, 1])

0.875


array([[4, 0],
       [1, 3]], dtype=int64)

In [15]:
# y is 1 if "growth" is greater than its median, 0 otherwise
y = (sales['growth'] > sales['growth'].median()).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, min_samples_leaf=9, random_state=10)\
    .fit(x_train, y_train)

rfc_pred = rfc.predict(x_test)
print(metrics.accuracy_score(y_test, rfc_pred))
metrics.confusion_matrix(y_test,rfc_pred, labels=[0, 1])

0.75


array([[1, 1],
       [1, 5]], dtype=int64)

In [16]:
# y has 3 classes, each class has 12 observations
def class_3(growth):
    if growth < -0.062:
        return 0
    elif growth < -0.007:
        return 1
    else:
        return 2

y = sales['growth'].map(class_3)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, min_samples_leaf=8, random_state=10)\
    .fit(x_train, y_train)

rfc_pred = rfc.predict(x_test)
metrics.accuracy_score(y_test, rfc_pred)

0.625

In [17]:
# y has 4 classes, each class has 9 observations
def class_4(growth):
    if growth < -0.075:
        return 0
    elif growth < -0.03:
        return 1
    elif growth < 0.1:
        return 2
    else:
        return 3
   
y = sales['growth'].map(class_4)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, min_samples_leaf=2, random_state=10)\
    .fit(x_train, y_train)

rfc_pred = rfc.predict(x_test)
metrics.accuracy_score(y_test, rfc_pred)

0.5

## XGBoost Classifier

In [18]:
# y is 1 if "growth" is greater than 0, 0 otherwise
y = (sales['growth'] > 0).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

xgbc = XGBClassifier().fit(x_train, y_train)

xgbc_pred = xgbc.predict(x_test)

print(metrics.accuracy_score(y_test, xgbc_pred))
metrics.confusion_matrix(y_test, xgbc_pred, labels=[0, 1])

0.875


array([[3, 1],
       [0, 4]], dtype=int64)

In [19]:
# y is 1 if "growth" is greater than its median, 0 otherwise
y = (sales['growth'] > sales['growth'].median()).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

xgbc = XGBClassifier().fit(x_train, y_train)

xgbc_pred = xgbc.predict(x_test)

print(metrics.accuracy_score(y_test, xgbc_pred))
metrics.confusion_matrix(y_test, xgbc_pred, labels=[0, 1])

0.625


array([[0, 2],
       [1, 5]], dtype=int64)

In [20]:
# y has 3 classes, each class has 12 observations
def class_3(growth):
    if growth < -0.062:
        return 0
    elif growth < -0.007:
        return 1
    else:
        return 2

y = sales['growth'].map(class_3)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

xgbc = XGBClassifier().fit(x_train, y_train)

xgbc_pred = xgbc.predict(x_test)

print(metrics.accuracy_score(y_test, xgbc_pred))

0.75


In [21]:
# y has 4 classes, each class has 9 observations
def class_4(growth):
    if growth < -0.075:
        return 0
    elif growth < -0.03:
        return 1
    elif growth < 0.1:
        return 2
    else:
        return 3
   
y = sales['growth'].map(class_4)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

xgbc = XGBClassifier().fit(x_train, y_train)

xgbc_pred = xgbc.predict(x_test)

print(metrics.accuracy_score(y_test, xgbc_pred))

0.5
