In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
import os

from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss

import statsmodels.api as sm

In [29]:
filename_x = './input_data/trainDF.csv'
filename_y = './input_data/trainY.csv'

df = pd.read_csv(filename_x).iloc[:, 1:]
Y = pd.read_csv(filename_y).iloc[:, 1:]

Y = np.argmax(np.array(Y), axis=1)

### Use only X1 - X5 to train a simple logistic regression

In [30]:
# remove history sentiment data

train = df.loc[:, 'X1':"X5"]
x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"C":[0.6, 0.7, 0.8, 0.9, 1]}
simple_model = GridSearchCV(LogisticRegression(), param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))


              precision    recall  f1-score   support

           0       0.33      0.23      0.27       673
           1       0.34      0.40      0.37       721
           2       0.37      0.42      0.39       706

    accuracy                           0.35      2100
   macro avg       0.35      0.35      0.34      2100
weighted avg       0.35      0.35      0.34      2100

0.35


### Use all the sentiment data to train a regression model with l1 reguralization

In [31]:

# get all history sentiment data
sentiment = df.loc[:, 'I1_lag1':'I10_lag47']
train = sentiment
x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"alpha":[1e-2, 3e-2, 1e-1, 3e-1, 1]}
simple_model = GridSearchCV(SGDClassifier(penalty='l1', loss='log'), 
                            param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))

              precision    recall  f1-score   support

           0       0.40      0.07      0.12       695
           1       0.36      0.72      0.48       701
           2       0.36      0.29      0.32       704

    accuracy                           0.36      2100
   macro avg       0.37      0.36      0.30      2100
weighted avg       0.37      0.36      0.31      2100

0.36


### Use X1-X10, max, min, median, mean, trend of sentiment data

In [32]:
def get_trend_of_ts(ts):
    '''
    get the slope of time series, if the slope is not significant, set it to 0
    
    input:
    -----------------
    ts: time series data
    
    
    output:
    -----------------
    trend[float] (slope)
    '''
    x = np.arange(0, len(ts))
    x = sm.add_constant(x)
    model = sm.OLS(ts, x).fit()
    if model.pvalues['x1'] < 0.05:
        return model.params['x1']
    else:
        return 0
    


def get_mean_max_min_median_trend_sentiment(df):
    '''
    get the max value, min value, median value, trend value of all sentiment
    
    input:
    -------------------------------
    df[DataFrame]: input data containing sentiment data
    
    
    output:
    -------------------------------
    max value, min value, median value, trend value of all sentiment
    
    '''
    output = pd.DataFrame()
    sentiment = df.loc[:, 'I1_lag1':'I10_lag47']
    for i in tqdm(range(1, 11)):
        col_name = "I{}".format(str(i))
        start, end = col_name + "_lag1", col_name + "_lag47"
        theme_data = sentiment.loc[:, start:end]
        output[col_name+"_max"] = theme_data.max(axis=1)
        output[col_name+"_min"] = theme_data.min(axis=1)
        output[col_name+"_mean"] = theme_data.mean(axis=1)
        output[col_name+"_median"] = theme_data.median(axis=1)
        output[col_name+"_trend"] = theme_data.apply(get_trend_of_ts, axis=1)
    
    return output


# Get the max, min, median, mean and trend of sentiment
sentiment = get_mean_max_min_median_trend_sentiment(df)
# Concatenate X1-X5 with sentiment feature
train = pd.concat([df.loc[:, 'X1':"X5"], sentiment], axis=1)

# split data into trainning data and validation data
x_train, x_valid, y_train, y_valid = train_test_split(train, Y,  
                                                      test_size=0.2)


parameter_grid = {"C":[0.6, 0.7, 0.8, 0.9, 1]}
simple_model = GridSearchCV(LogisticRegression(), param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))
print(log_loss(y_valid, simple_model.predict_proba(x_valid)))

100%|██████████████████████████████████████████| 10/10 [01:11<00:00,  7.15s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.45      0.28      0.34       714
           1       0.45      0.68      0.54       716
           2       0.43      0.38      0.40       670

    accuracy                           0.45      2100
   macro avg       0.44      0.44      0.43      2100
weighted avg       0.44      0.45      0.43      2100

0.4461904761904762
1.0592293663015067


### Sentiment, X1-X5 and cross comparison

- Use X1-X5, 
- max, min, median, mean, trend of sentiment data, 
- cross comparison of X1-X5 (E.X. whether X1 > X2)

In [33]:
def add_cross_comparison(df):
    '''
    compare Xi with Xj, if Xi > Xj, then the value of new column "XiXj" is 1, else 0 
    '''
    for i in range(1, 6):
        for j in range(1, 6):
            col1, col2 = "X{}".format(str(i)), "X{}".format(str(j))
            df[col1+col2] = df[col1] > df[col2]
    return df


# Concatenate X1-X5 with sentiment feature
df = pd.concat([df.loc[:, 'X1':"X5"], sentiment], axis=1)
# Add cross comparison feature
df = add_cross_comparison(df)

# split data into trainning data and validation data
x_train, x_valid, y_train, y_valid = train_test_split(df, Y,  
                                                      test_size=0.2)


parameter_grid = {"C":[0.5, 0.6, 0.7, 0.8, 0.9, 1]}
simple_model = GridSearchCV(LogisticRegression(), param_grid=parameter_grid,
                            n_jobs=-1)
simple_model.fit(x_train, y_train)


print(classification_report(y_valid, simple_model.predict(x_valid)))
print(accuracy_score(y_valid, simple_model.predict(x_valid)))
print(log_loss(y_valid, simple_model.predict_proba(x_valid)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.41      0.31      0.36       627
           1       0.53      0.69      0.60       760
           2       0.46      0.40      0.43       713

    accuracy                           0.48      2100
   macro avg       0.47      0.47      0.46      2100
weighted avg       0.47      0.48      0.47      2100

0.48
1.0292395875960232


<a>Output submission file into the desired format</a>

In [34]:
# Read test file
test_filename = './input_data/input_test_PkjtqdQ.csv'
df = pd.read_csv(test_filename).iloc[:, 1:]

# Get the max, min, median, mean and trend of sentiment
sentiment = get_mean_max_min_median_trend_sentiment(df)
df = pd.concat([df.loc[:, 'X1':"X5"], sentiment], axis=1)

# add the cross comparison features
df_with_cross_comparison = add_cross_comparison(df)

# Predict probability of test data
output = simple_model.predict_proba(df_with_cross_comparison)
output = pd.DataFrame(output, columns=['Target -1', 'Target 0', 'Target 1'])
output.to_csv("./output_data/lr_sentiment_crossComparison.csv")


100%|██████████████████████████████████████████| 10/10 [00:33<00:00,  3.36s/it]
