In [1]:
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
import eli5
import joblib

In [2]:
# directory_shift
%cd ..

/home/sergiusz/Sergiusz_main/Business/Bitcoin_sentiments


### Model artifacts directories management:

In [3]:
directory_shift = ""

with open(directory_shift + "params.yaml") as conf_file:
    config = yaml.safe_load(conf_file)

# data_input:
all_data = config["data_input"]["all_data"]

# model_input:
rnd_seed = config["model_input"]["rnd_seed"]

# model_output:
base_model = config["model_output"]["base_model"]
vectorizer_dir = config["model_output"]["vectorizer"]

### Read the data:

In [4]:
DATA = pd.read_csv(
    all_data, encoding="ISO-8859-1", names=["Sentiment", "News Headline"]
)

### Exploration:

In [5]:
# print(DATA.shape)
# DATA.head()

In [6]:
# DATA[DATA['Sentiment'].isnull()]
# DATA[DATA['News Headline'].isnull()]

# There is no NaNs

In [7]:
# overfitting_management
train_share = 0.8
rule_of_ten = int(DATA[DATA.Sentiment == "negative"].shape[0] / 10 * train_share)
print(rule_of_ten)

DATA.Sentiment.value_counts() / DATA.shape[0]
# DATA.Sentiment.value_counts()

48


neutral     0.594098
positive    0.281263
negative    0.124639
Name: Sentiment, dtype: float64

### Labeling

In [8]:
keys = ["neutral", "positive", "negative"]
vals = [0, 1, 2]
map_dict = dict(zip(keys, vals))

DATA.Sentiment = DATA.Sentiment.map(map_dict)

### Train-test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    DATA["News Headline"],
    DATA.Sentiment,
    test_size=0.20,
    random_state=rnd_seed,
    shuffle=True,
    stratify=DATA.Sentiment,
)

del DATA

### tf-idf

In [10]:
vectorizer = TfidfVectorizer(
    stop_words="english", ngram_range=(1, 2), lowercase=True, max_features=4000
)

In [11]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [12]:
X_train_vectorized.shape, X_test_vectorized.shape

((3876, 4000), (970, 4000))

### LogReg:

In [13]:
logreg = LogisticRegression(
    C=0.01,
    solver="lbfgs",
    multi_class="multinomial",
    class_weight="balanced",
    random_state=rnd_seed,
    n_jobs=-1,
)

### Cross-validation:

In [14]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rnd_seed)

In [15]:
crossval_res = cross_val_score(
    logreg, X_train_vectorized, y_train, cv=skf, scoring="f1_micro"
)

In [16]:
crossval_res, crossval_res.mean(), crossval_res.std()

(array([0.64175258, 0.68      , 0.66967742, 0.6516129 , 0.66064516]),
 0.6607376122381111,
 0.013376547732267822)

### Model_func_estimation:

In [17]:
logreg.fit(X_train_vectorized, y_train)

### Model interpretation:

In [18]:
keys = [0, 1, 2]
vals = ["neutral", "positive", "negative"]
y_dict = dict(zip(keys, vals))

eli5.show_weights(
    estimator=logreg,
    target_names=y_dict,
    feature_names=list(vectorizer.get_feature_names()),
    top=(25, 10),
)



Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.056,business,
+0.053,<BIAS>,
+0.052,value,
+0.050,new,
+0.049,shares,
+0.044,approximately,
+0.041,services,
+0.039,deal,
+0.039,includes,
+0.038,disclosed,

Weight?,Feature
+0.056,business
+0.053,<BIAS>
+0.052,value
+0.050,new
+0.049,shares
+0.044,approximately
+0.041,services
+0.039,deal
+0.039,includes
+0.038,disclosed

Weight?,Feature
+0.116,rose
+0.108,increase
+0.083,increased
+0.071,signed
+0.069,year
+0.065,grew
+0.059,agreement
+0.056,profit rose
+0.054,improved
+0.054,awarded

Weight?,Feature
+0.245,mn
+0.234,eur
+0.189,decreased
+0.163,eur mn
+0.152,profit
+0.129,fell
+0.123,quarter
+0.120,period
+0.116,compared
+0.115,loss


### Prediction test:

In [19]:
arg = ["Something important happened in first quarter of 2009"]
X = vectorizer.transform(arg)
logreg.predict(X)

array([2])

### Save our model:

In [20]:
joblib.dump(logreg, base_model)
joblib.dump(vectorizer, vectorizer_dir)

['model/vectorizer.pkl']

In [21]:
joblib_model = joblib.load(base_model)
joblib_vectorizer = joblib.load(vectorizer_dir)

In [22]:
# arg = ['Something important happened in first quarter of 2009']
# X = joblib_vectorizer.transform(arg)
# joblib_model.predict(X)

array([2])