In [1]:
from pathlib import Path

import eli5
import pandas as pd
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split

# import pickle

In [2]:
# directory_shift
%cd ..

/home/sergiusz/Sergiusz_main/Business/Bitcoin_sentiments


### Configuration: data input and model artifact paths

In [3]:
# Prefix prepended to the config file name; empty because the kernel is
# expected to run from the directory that holds params.yaml.
directory_shift = ""

# Read the project configuration. The encoding is given explicitly so the file
# is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open(directory_shift + "params.yaml", encoding="utf-8") as conf_file:
    config = yaml.safe_load(conf_file)

# data_input: location of the labelled headlines file read below.
all_data = config["data_input"]["all_data"]

# model_output: target for the trained baseline model artifact.
base_model = config["model_output"]["base_model"]

# reproducibility: seed shared by the split, the CV folds and the estimator.
rnd_seed = config["reproducibility"]["rnd_seed"]

### Read the data:

In [4]:
# Column names are supplied explicitly via `names`; the file is decoded as
# Latin-1.
column_names = ["Sentiment", "News Headline"]
DATA = pd.read_csv(all_data, names=column_names, encoding="ISO-8859-1")

### Exploration:

In [5]:
# print(DATA.shape)
# DATA.head()

In [6]:
# DATA[DATA['Sentiment'].isnull()]
# DATA[DATA['News Headline'].isnull()]

# There are no NaNs in either column.

In [7]:
# overfitting_management
# Size budget derived from the "negative" class: one tenth of the negative
# headlines that will end up in the training share of the data.
train_share = 0.8
n_negative = int((DATA["Sentiment"] == "negative").sum())
rule_of_ten = int(n_negative / 10 * train_share)
print(rule_of_ten)

# Share of each sentiment class in the full data set
# (drop the division to see absolute counts instead).
DATA["Sentiment"].value_counts() / len(DATA)

48


neutral     0.594098
positive    0.281263
negative    0.124639
Name: Sentiment, dtype: float64

### Labeling

In [8]:
# Encode the three sentiment classes as integers for the classifier.
keys = ["neutral", "positive", "negative"]
vals = [0, 1, 2]
map_dict = dict(zip(keys, vals))

# Guard keeps the cell safe to re-run: mapping an already-encoded column a
# second time would turn every label into NaN.
if not DATA["Sentiment"].isin(vals).all():
    encoded = DATA["Sentiment"].map(map_dict)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly instead of passing NaN labels on to the stratified split.
        unknown = sorted(DATA.loc[unmapped, "Sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels (cannot encode): {unknown}")
    DATA["Sentiment"] = encoded.astype("int64")

### Train-test split

In [9]:
# Hold out 20% of the headlines for testing. The split is shuffled, seeded for
# reproducibility and stratified on the label so both parts keep the same
# class proportions.
split_options = dict(
    test_size=0.20,
    random_state=rnd_seed,
    shuffle=True,
    stratify=DATA["Sentiment"],
)
X_train, X_test, y_train, y_test = train_test_split(
    DATA["News Headline"], DATA["Sentiment"], **split_options
)

# The full frame is no longer needed once the split exists; the options dict
# is dropped as well so it does not keep a reference to the label column.
del DATA, split_options

### tf-idf

In [10]:
# TF-IDF features built from word bigrams only.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(2, 2),  # (min_n, max_n): bigrams only, no unigrams
    max_features=8000,  # cap on the vocabulary size
)

In [11]:
# Learn the vocabulary and IDF weights on the training headlines only, then
# reuse the fitted vectorizer on the test headlines so no test information
# leaks into the features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [12]:
# Sanity check: (n_samples, n_features) of the train and test matrices; both
# must have the same number of columns.
X_train_vectorized.shape, X_test_vectorized.shape

((3876, 8000), (970, 8000))

### LogReg:

In [13]:
# Baseline classifier: multinomial logistic regression.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`, and
# `n_jobs` is presumably unused for a multinomial lbfgs fit - confirm against
# the installed version before upgrading.
logreg_params = {
    "solver": "lbfgs",
    "multi_class": "multinomial",
    "C": 0.01,  # inverse regularisation strength
    "class_weight": "balanced",  # reweight classes; the labels are imbalanced
    "random_state": rnd_seed,
    "n_jobs": -1,
}
logreg = LogisticRegression(**logreg_params)

### Cross-validation:

In [14]:
# Five shuffled, label-stratified folds, seeded with the same seed as the
# train/test split.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=rnd_seed,
)

In [15]:
# Score the (unfitted) estimator on each fold of the training split only.
# NOTE(review): for single-label multiclass data micro-averaged F1 should
# coincide with accuracy; consider "f1_macro" given the class imbalance -
# confirm the intended metric.
crossval_res = cross_val_score(
    estimator=logreg,
    X=X_train_vectorized,
    y=y_train,
    scoring="f1_micro",
    cv=skf,
)

In [16]:
# Per-fold scores, followed by their mean and standard deviation.
crossval_res, crossval_res.mean(), crossval_res.std()

(array([0.64046392, 0.66580645, 0.65935484, 0.66322581, 0.66580645]),
 0.658931493182574,
 0.009531020987964873)

### Fit the model on the full training set:

In [17]:
# Fit the final model on the whole vectorized training split; the test split
# is not used here.
logreg.fit(X_train_vectorized, y_train)

### Model interpretation:

In [18]:
# Class id -> readable sentiment name, used to label the weight tables.
keys = [0, 1, 2]
vals = ["neutral", "positive", "negative"]
y_dict = dict(zip(keys, vals))

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Show the highest-weighted bigrams per class for the fitted model.
eli5.show_weights(
    estimator=logreg,
    target_names=y_dict,
    feature_names=feature_names,
    top=(25, 10),
)



Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.033,financial details,
+0.028,share capital,
+0.026,voting rights,
+0.024,alma media,
+0.020,<BIAS>,
+0.020,general meeting,
+0.020,alexandria va,
+0.019,company added,
+0.018,product development,
+0.017,new shares,

Weight?,Feature
+0.033,financial details
+0.028,share capital
+0.026,voting rights
+0.024,alma media
+0.020,<BIAS>
+0.020,general meeting
+0.020,alexandria va
+0.019,company added
+0.018,product development
+0.017,new shares

Weight?,Feature
+0.072,profit rose
+0.068,rose eur
+0.052,operating profit
+0.039,period increased
+0.033,long term
+0.032,increased eur
+0.031,sales increase
+0.030,signed agreement
+0.029,mn eur
+0.028,increased respectively

Weight?,Feature
+0.230,eur mn
+0.113,corresponding period
+0.100,decreased eur
+0.091,profit eur
+0.089,mn eur
+0.085,operating loss
+0.080,compared profit
+0.077,operating profit
+0.075,totalled eur
+0.075,sales decreased


### Save our model to .pkl: