In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pickle
from tqdm import tqdm

import warnings 
warnings.filterwarnings("ignore", category=Warning)

In [2]:
with open('../data/tweet_data.pkl', 'rb') as f:
    df = pickle.load(f)

In [3]:
df.shape

(49159, 8)

In [4]:
train = df.loc[df.label.isna() == False]
test = df.loc[df.label.isna() == True]
train.shape, test.shape

((31962, 8), (17197, 8))

In [5]:
def get_embedding(data: pd.Series,
                  vectorizer: callable,
                  parameters: dict = {'max_df': 0.9,
                                      'max_features': 1000,
                                      'stop_words': 'english'}) -> pd.DataFrame:
    
    """
    Создание Bag-of-Words эмбеддингов.

    Parameters
    ----------
    data: pd.Series
        Последовательность списков токенов. 
        Данные для получения эмбеддинга

    vectorizer: callable
        Vectorizer для получения Bag-of-words.
        Ожидается CountVectorizer или TfIdfVectorizer.
    
    parameters: dict
        Параметры модели.
        Опциональный параметр, по умолчанию заданы базовые настройки.


    Returns
    -------
    pd.DataFrame
        Разреженная матрица токенизиорованных эмбеддингов для использования в моделях.

    """
    
    model = vectorizer(**parameters)
    values = model.fit_transform(data.astype('str'))
    feature_names = model.get_feature_names()
    
    return pd.DataFrame(values.toarray(), columns = feature_names)


def eval_model(model: callable,
               X_train: pd.DataFrame,
               y_train: pd.Series,
               X_test: pd.DataFrame,
               y_test: pd.Series,
               eval_metric: callable = f1_score) -> float:
    
    """
    Оценка качеcтва модели.

    Parameters
    ----------
    model: callabe
        Инициализированная модель для обучения.
    
    X_train, X_test: pandas.DataFrame
    
    Разреженные матрицы токенизированных эмбеддингов.
    Получение из сплитов в функции get_embedding().
    
    y_train, y_test:
    
    Размеченные сплиты меток классов.
    
    eval_metric: callable
        Метрика качества.
        Опциональный параметр, по умолчанию используется f1_score.

    Returns
    -------
    score: float
        Расчетное качество по заданной метрике
    """
    
    model.fit(X_train, y_train)
    pred_prob = model.predict_proba(X_test)
    y_pred = np.where(pred_prob[:,1]>0.3, 1,0)
    score = f1_score(y_test, y_pred)
    
    return score

def get_set_embeddings (X_train: pd.Series) -> pd.DataFrame:
    
    """
    Создание 4 комплектов эмбедингов.

    Parameters
    ----------
    X_train: pd.Series
        Последовательность списков токенов. 
        Данные для получения эмбеддинга    



    Returns
    -------
    pd.DataFrame x 4
        Stemmed Count Vectorized embedds
        Lemmatized Count Vectorized embedds
        Stemmed TfIdf embedds
        Lemmatized TfIdf embedds

    """
    X_train_stemmed_CV = get_embedding(data=X_train[X_train.columns[0]], vectorizer=CountVectorizer)
    X_train_lemmatized_CV = get_embedding(data=X_train[X_train.columns[1]], vectorizer=CountVectorizer)
    X_train_stemmed_tfidf = get_embedding(data=X_train[X_train.columns[0]], vectorizer=TfidfVectorizer)
    X_train_lemmatized_tfidf = get_embedding(data=X_train[X_train.columns[1]], vectorizer=TfidfVectorizer)
    
    return X_train_stemmed_CV, X_train_lemmatized_CV, X_train_stemmed_tfidf, X_train_lemmatized_tfidf

In [6]:
X = train[['tweet_stemmed', 'tweet_lemmatized']]
y = train['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=17,
    shuffle=True,
    stratify=y
)

In [9]:
X_train_stemmed_CV, X_train_lemmatized_CV,\
X_train_stemmed_tfidf, X_train_lemmatized_tfidf = get_set_embeddings (X_train)

X_test_stemmed_CV, X_test_lemmatized_CV,\
X_test_stemmed_tfidf, X_test_lemmatized_tfidf = get_set_embeddings (X_test)

In [12]:
clf = LogisticRegression()

F1_CS = eval_model(clf, X_train_stemmed_CV, y_train, X_test_stemmed_CV, y_test)
F1_CL = eval_model(clf, X_train_lemmatized_CV, y_train, X_test_lemmatized_CV, y_test)
F1_TS = eval_model(clf, X_train_stemmed_tfidf, y_train, X_test_stemmed_tfidf, y_test)
F1_TL = eval_model(clf, X_train_lemmatized_tfidf, y_train, X_test_lemmatized_tfidf, y_test)

print(f"F1-score: Count Vectorizer Stemmed - {round(F1_CS, 3)}")
print(f"F1-score: Count Vectorizer Lemmatized - {round(F1_CL, 3)}")
print(f"F1-score: TfIdf Vectorizer Stemmed - {round(F1_TS, 3)}")
print(f"F1-score: TfIdf Vectorizer Lemmatized - {round(F1_TL, 3)}")

F1-score: Count Vectorizer Stemmed - 0.057
F1-score: Count Vectorizer Lemmatized - 0.094
F1-score: TfIdf Vectorizer Stemmed - 0.047
F1-score: TfIdf Vectorizer Lemmatized - 0.093
