# Classification to identify price-related tweets

**Objective**

We use the embeddings computed for each tweet and the labels added manually to train a random forest model to detect tweets whether a tweet is related to prices matter.

## Setup

In [60]:
import numpy as np 
import pandas as pd 

In [74]:
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    balanced_accuracy_score, 
    make_scorer,
)

## Data

### Load annotated dataset and embeddings

In [50]:
df_annotated = pd.read_csv("../backup/data/df_annotated.csv")

In [31]:
df_annotated.head()

Unnamed: 0,timestamp,text,emojis,retweets,likes,comments,lexical_field,text_emojis,label
0,2020-01-01,le tout vendu à des prix \n«défiant toute conc...,,,,,cheap,,
1,2020-01-01,Pour ceux qui veulent allier l'utile à l'agréa...,🛑 🛑,,,,cheap,Pour ceux qui veulent allier l'utile à l'agréa...,
2,2020-01-02,"Nouvelle perspective: BARAKA BUSINESS SARL, ne...",,3.0,,,cheap,,not_about_prices
3,2020-01-02,J'ai du mal à savoir si #Cirelli a reçu la Leg...,,3.0,1.0,,cheap,,
4,2020-01-06,Un iPhone pas cher ? C’est maintenant possible...,⭐ 😉,284.0,69.0,36.0,cheap,Un iPhone pas cher ? C’est maintenant possible...,


In [47]:
df_annotated["label"].value_counts()

not_about_prices    684
inflation           302
desinflation         14
other                11
deflation             5
stability             2
Name: label, dtype: int64

In [46]:
def add_topic_variable(label: str) -> float: 
    """Description. 
    Return 1 if label is about prices, 0 is not about prices and nan if no label."""

    if pd.isna(label): 
        return float("nan")
    elif label == "not_about_prices": 
        return 0
    else: 
        return 1

In [51]:
df_annotated["topic"] = df_annotated["label"].apply(add_topic_variable)

In [17]:
embeddings = np.load("../backup/data/embeddings.npy")

In [32]:
assert embeddings.shape[0] == df_annotated.shape[0]

### Train/test split 

In [52]:
annotated_idxs = df_annotated.loc[df_annotated["topic"].notna(), :].index.values.tolist()

In [59]:
df_annotated["topic"][annotated_idxs].value_counts()

0.0    684
1.0    334
Name: topic, dtype: int64

In [120]:
X, y = embeddings, df_annotated["topic"].values

X_train, y_train = X[annotated_idxs, :], y[annotated_idxs]
X_test = np.delete(X, annotated_idxs, axis=0)

## Models

### Define balanced accuracy as metric

In [121]:
balanced_scorer = make_scorer(balanced_accuracy_score)

In [122]:
# instantiate list to save model results

results = []

In [123]:
from typing import List, Dict, Union

def get_model_dict(model_name: str, params: Union[List, Dict], scores: Union[List, float]) -> Dict: 
    """Description. Save model results into dictionnary."""

    return {
        "model": model_name, 
        "params": params, 
        "scores": scores
    }

### `DummyClassifier` benchmark

In [124]:
dummy = DummyClassifier()
scores = cross_val_score(dummy, X_train, y_train, scoring=balanced_scorer, cv=3)

In [125]:
results.append(get_model_dict("DummyClassifier", dummy.get_params(), scores))

### `LogisticRegression`

In [126]:
pipe = Pipeline([
    ("scaler", StandardScaler()), 
    ("logreg", LogisticRegression(max_iter=1000))
])
parameters = {"logreg__C": np.logspace(-8, 8, 17, base=2)} 

log_reg = GridSearchCV(pipe, parameters, cv=3, scoring=balanced_scorer)
log_reg

In [127]:
log_reg.fit(X_train, y_train)

In [128]:
params = log_reg.best_estimator_.get_params()["steps"]
params

[('scaler', StandardScaler()),
 ('logreg', LogisticRegression(C=0.00390625, max_iter=1000))]

In [108]:
scores = log_reg.cv_results_["mean_test_score"]
scores

array([0.75869714, 0.75723444, 0.74407655, 0.72856832, 0.71316523,
       0.70520332, 0.68374131, 0.67281591, 0.66252907, 0.6551401 ,
       0.65440911, 0.65296053, 0.65003655, 0.65153805, 0.65080706,
       0.65157756, 0.65084657])

In [129]:
results.append(get_model_dict("LogisticRegression", params, scores))

### `RandomForestClassifier` (no tuning)

In [130]:
rf = RandomForestClassifier(oob_score=True)
rf.fit(X_train, y_train)

In [131]:
results.append(get_model_dict("RandomForestClassifier", rf.get_params(), rf.oob_score_))

### `GradientBoostingClassifier` (no tuning)

In [132]:
gb = GradientBoostingClassifier()
gb

In [133]:
scores = cross_val_score(gb, X_train, y_train)

In [134]:
results.append(get_model_dict("GradientBoostingClassifier", gb.get_params(), scores))

In [135]:
results

[{'model': 'DummyClassifier',
  'params': {'constant': None, 'random_state': None, 'strategy': 'prior'},
  'scores': array([0.5, 0.5, 0.5])},
 {'model': 'LogisticRegression',
  'params': [('scaler', StandardScaler()),
   ('logreg', LogisticRegression(C=0.00390625, max_iter=1000))],
  'scores': array([0.5, 0.5, 0.5])},
 {'model': 'RandomForestClassifier',
  'params': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'sqrt',
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 100,
   'n_jobs': None,
   'oob_score': True,
   'random_state': None,
   'verbose': 0,
   'warm_start': False},
  'scores': 0.8084479371316307},
 {'model': 'GradientBoostingClassifier',
  'params': {'ccp_alpha': 0.0,
   'criterion': 'friedman_mse',
   'init': None,
   'learning_rate': 0.1,
   'loss