In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
import os

from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

import statsmodels.api as sm

In [2]:
filename_x = './input_data/trainDF.csv'
filename_y = './input_data/trainY.csv'

df = pd.read_csv(filename_x).iloc[:, 1:]
Y = pd.read_csv(filename_y).iloc[:, 1:]

Y = np.argmax(np.array(Y), axis=1)

### Use only X1 - X5 to train a simple logistic regression

In [3]:
train = df.loc[:, 'X1':"X5"]
x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"C":[0.6, 0.7, 0.8, 0.9, 1]}
simple_model = GridSearchCV(LogisticRegression(), param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))


              precision    recall  f1-score   support

           0       0.40      0.18      0.25       693
           1       0.35      0.45      0.39       709
           2       0.36      0.45      0.40       698

    accuracy                           0.36      2100
   macro avg       0.37      0.36      0.35      2100
weighted avg       0.37      0.36      0.35      2100

0.36047619047619045


### Use all the sentiment data to train a regression model with l1 reguralization

In [4]:
sentiment = df.loc[:, 'I1_lag1':'I10_lag47']
train = sentiment
x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"alpha":[1e-2, 3e-2, 1e-1, 3e-1, 1]}
simple_model = GridSearchCV(SGDClassifier(penalty='l1', loss='log'), 
                            param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))

              precision    recall  f1-score   support

           0       0.32      0.16      0.21       638
           1       0.39      0.55      0.45       743
           2       0.37      0.38      0.38       719

    accuracy                           0.37      2100
   macro avg       0.36      0.36      0.35      2100
weighted avg       0.36      0.37      0.35      2100

0.3719047619047619


### Use X1-X10, max, min, median, mean, trend of sentiment data

In [9]:
def get_trend_of_ts(ts):
    '''
    get the slope of time series, if the slope is not significant, set it to 0
    
    input:
    -----------------
    ts: time series data
    
    
    output:
    -----------------
    trend[float] (slope)
    '''
    x = np.arange(0, len(ts))
    x = sm.add_constant(x)
    model = sm.OLS(ts, x).fit()
    if model.pvalues['x1'] < 0.05:
        return model.params['x1']
    else:
        return 0
    


def get_mean_max_min_median_trend_sentiment(df):
    '''
    get the max value, min value, median value, trend value of all sentiment
    
    input:
    -------------------------------
    df[DataFrame]: input data containing sentiment data
    
    
    output:
    -------------------------------
    max value, min value, median value, trend value of all sentiment
    
    '''
    output = pd.DataFrame()
    sentiment = df.loc[:, 'I1_lag1':'I10_lag47']
    for i in tqdm(range(1, 11)):
        col_name = "I{}".format(str(i))
        start, end = col_name + "_lag1", col_name + "_lag47"
        theme_data = sentiment.loc[:, start:end]
        output[col_name+"_max"] = theme_data.max(axis=1)
        output[col_name+"_min"] = theme_data.min(axis=1)
        output[col_name+"_mean"] = theme_data.mean(axis=1)
        output[col_name+"_median"] = theme_data.median(axis=1)
        output[col_name+"_trend"] = theme_data.apply(get_trend_of_ts, axis=1)
    
    return output


sentiment = get_mean_max_min_median_trend_sentiment(df)
train = pd.concat([df.loc[:, 'X1':"X5"], sentiment], axis=1)

x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"C":[0.6, 0.7, 0.8, 0.9, 1]}
simple_model = GridSearchCV(LogisticRegression(), param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))
print(log_loss(y_valid, simple_model.predict_proba(x_valid)))

100%|██████████████████████████████████████████| 10/10 [01:12<00:00,  7.22s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.39      0.30      0.34       639
           1       0.50      0.68      0.58       738
           2       0.43      0.36      0.39       723

    accuracy                           0.45      2100
   macro avg       0.44      0.45      0.44      2100
weighted avg       0.44      0.45      0.44      2100

0.45476190476190476
1.0401394041283194
