In [1]:
import pandas as pd
from hw2.metrictool import MetricRegressionManager

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): with no category/module filter this also hides warnings raised
# by the libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Pickled DataFrame with the preprocessed tweets (presumably written by hw1).
# A forward slash is used on purpose: '\d' is an invalid escape sequence in a
# regular string literal, and a backslash separator only resolves on Windows,
# while '/' works on every platform.
PATH_PREPROCESSING = 'hw1/df_preprocessing.pkl'

# Column names of the preprocessed tweet DataFrame.
TWEET = 'tweet'
TWEET_CLEAN = 'clean_tweet'
TWEET_TOKENIZE = 'tweet_token'
TWEET_FILTERED = 'tweet_token_filtered'
TWEET_STEMMED = 'tweet_stemmed'
TWEET_LEMMATIZED = 'tweet_lemmatized'

In [4]:
# Single shared metrics collector: fit_model() reports every experiment to it
# through .apply(), and the final cell renders the accumulated results.
metric_manager = MetricRegressionManager()

In [5]:
# Load the preprocessed tweets and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df:pd.DataFrame = pd.read_pickle(PATH_PREPROCESSING)
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### Для 'tweet_stemmed' и 'tweet_lemmatized' создадим мешок слов с помощью CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

def create_bov_countvectorizer(name_column:str, max_features:int=1000, max_df:float=0.9, ngram_range=(1,1)):
    """Fit a CountVectorizer bag-of-words on one token column of the global ``df``.

    Args:
        name_column: Name of a ``df`` column whose cells are sequences of
            string tokens (e.g. the stemmed or lemmatized tweet tokens).
        max_features: Passed to ``CountVectorizer(max_features=...)``.
        max_df: Passed to ``CountVectorizer(max_df=...)``.
        ngram_range: Passed to ``CountVectorizer(ngram_range=...)``.

    Returns:
        A ``(vectorizer, bow)`` tuple: the fitted vectorizer and the matrix
        returned by ``fit_transform`` for the column's documents.
    """
    vectorizer = CountVectorizer(
        max_df=max_df,
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words='english'
        )
    # The column stores token lists, while the vectorizer expects raw text
    # documents, so every row is joined back into a space-separated string.
    documents = [' '.join(tokens) for tokens in df[name_column]]
    bow = vectorizer.fit_transform(documents)
    return vectorizer, bow

In [7]:
# Count-based bag-of-words over the stemmed tokens with the default settings
# (max_features=1000, max_df=0.9, unigrams only).
cov_stemmed, bow_cov_stemmed = create_bov_countvectorizer(TWEET_STEMMED)

In [8]:
# Count-based bag-of-words over the lemmatized tokens with the default settings
# (max_features=1000, max_df=0.9, unigrams only).
cov_lemmatized, bow_cov_lemmatized = create_bov_countvectorizer(TWEET_LEMMATIZED)

### Для 'tweet_stemmed' и 'tweet_lemmatized' создадим мешок слов с помощью TfidfVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_bov_tfidfvectorizer(name_column:str, max_features:int=1000, max_df:float=0.9, ngram_range=(1,1)):
    """Fit a TfidfVectorizer bag-of-words on one token column of the global ``df``.

    Args:
        name_column: Name of a ``df`` column whose cells are sequences of
            string tokens (e.g. the stemmed or lemmatized tweet tokens).
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.
        max_df: Passed to ``TfidfVectorizer(max_df=...)``.
        ngram_range: Passed to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        A ``(vectorizer, bow)`` tuple: the fitted vectorizer and the matrix
        returned by ``fit_transform`` for the column's documents.
    """
    vectorizer = TfidfVectorizer(
        max_df=max_df,
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words='english'
        )
    # The column stores token lists, while the vectorizer expects raw text
    # documents, so every row is joined back into a space-separated string.
    documents = [' '.join(tokens) for tokens in df[name_column]]
    bow = vectorizer.fit_transform(documents)
    return vectorizer, bow

In [10]:
# TF-IDF bag-of-words over the stemmed tokens with the default settings
# (max_features=1000, max_df=0.9, unigrams only).
tfidf_stemmed, bow_tfidf_stemmed = create_bov_tfidfvectorizer(TWEET_STEMMED)

In [11]:
# TF-IDF bag-of-words over the lemmatized tokens with the default settings
# (max_features=1000, max_df=0.9, unigrams only).
tfidf_lemmatized, bow_tfidf_lemmatized = create_bov_tfidfvectorizer(TWEET_LEMMATIZED)

### Проверим векторайзеры на корпусе

In [12]:
# Load the labelled corpus. The context manager guarantees the file handle is
# closed even if reading fails.
# NOTE(review): the platform default encoding is used, as before; pass an
# explicit encoding if the corpus is known to be UTF-8.
with open('hw2/corpus') as corpus_file:
    data = corpus_file.read()

# The first whitespace-separated token of every line is the label, the rest of
# the line is the text.
labels, texts = [], []
for line in data.split("\n"):
    content = line.split()
    if not content:
        # Blank line (e.g. produced by a trailing newline at the end of the
        # file): skip it instead of failing with IndexError on content[0].
        continue
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# Build the DataFrame in one step and preview it.
trainDF = pd.DataFrame({'text': texts, 'label': labels})
trainDF.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [13]:
from sklearn import model_selection, preprocessing, linear_model

# A fixed seed keeps the split, and every metric computed from it, reproducible
# across kernel restarts.
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], random_state=42
)

# Label-encode the target variable. The encoder is fitted on the training
# labels only and then reused for validation, so both splits share the same
# label -> integer mapping. Re-fitting on the validation labels could silently
# produce a different mapping if the sets of labels differ; transform() raises
# ValueError on an unseen label instead.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.transform(y_valid)

In [14]:
from sklearn.decomposition import TruncatedSVD

def fit_model(vectorizer, name:str, use_svd:bool = False, n_components:int = 200):
    """Train a logistic regression on vectorized corpus texts and record its metrics.

    Uses the module-level ``x_train``/``x_valid``/``y_train``/``y_valid`` split
    and reports the result to the module-level ``metric_manager``.

    Args:
        vectorizer: An already fitted vectorizer exposing ``transform``.
        name: Label under which the run is passed to ``metric_manager.apply``.
        use_svd: When True, reduce the vectorized features with TruncatedSVD
            before training.
        n_components: Number of SVD components; only used when ``use_svd`` is
            True. Defaults to 200, the value that used to be hard-coded.

    Returns:
        None. The validation labels and the predicted probabilities of the
        class in column 1 are handed to ``metric_manager.apply``.
    """
    xtrain_vectorizing = vectorizer.transform(x_train)
    xvalid_vectorizing = vectorizer.transform(x_valid)

    if use_svd:
        # The SVD is fitted on the training matrix only and then applied to
        # the validation matrix.
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        xtrain_vectorizing = svd.fit_transform(xtrain_vectorizing)
        xvalid_vectorizing = svd.transform(xvalid_vectorizing)

    classifier = linear_model.LogisticRegression()
    classifier.fit(xtrain_vectorizing, y_train)
    # Column 1 of predict_proba is the probability of the second class in
    # classifier.classes_.
    y_pred = classifier.predict_proba(xvalid_vectorizing)[:,1]

    metric_manager.apply(name, pd.Series(y_valid), y_pred)

In [15]:
# Reference run: a word-level CountVectorizer whose vocabulary is learned from
# the corpus texts themselves.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])
fit_model(count_vect, 'lection base')

# Vectorizers fitted on the tweet columns with default settings, evaluated in
# the same order as they should appear in the report.
baseline_runs = (
    (cov_stemmed, 'count_vectorizer stemmed base'),
    (cov_lemmatized, 'count_vectorizer lemmatized base'),
    (tfidf_stemmed, 'tfidf stemmed base'),
    (tfidf_lemmatized, 'tfidf lemmatized base'),
)
for fitted_vectorizer, report_name in baseline_runs:
    fit_model(fitted_vectorizer, report_name)

In [16]:
# Hyper-parameter settings to evaluate: (suffix used in the report name,
# keyword arguments forwarded to the vectorizer factory). The suffixes are
# spelled out explicitly so the report names stay stable.
experiment_settings = (
    ('max_features=800', {'max_features': 800}),
    ('max_features=600', {'max_features': 600}),
    ('max_df=0.8', {'max_df': 0.8}),
    ('ngram_range=(2,2)', {'ngram_range': (2, 2)}),
    ('ngram_range=(1,2)', {'ngram_range': (1, 2)}),
)

# Vectorizer families: (prefix used in the report name, factory function).
vectorizer_families = (
    ('count_vectorizer', create_bov_countvectorizer),
    ('tfidf', create_bov_tfidfvectorizer),
)

# Token columns: (word used in the report name, DataFrame column).
token_columns = (
    ('stemmed', TWEET_STEMMED),
    ('lemmatized', TWEET_LEMMATIZED),
)

# One run per setting x family x token column. The nesting order reproduces
# the report order: for each setting, count stemmed, count lemmatized,
# tfidf stemmed, tfidf lemmatized.
for setting_label, setting_kwargs in experiment_settings:
    for family_label, build_vectorizer in vectorizer_families:
        for column_label, column_name in token_columns:
            experiment_vectorizer, _ = build_vectorizer(column_name, **setting_kwargs)
            fit_model(experiment_vectorizer, f'{family_label} {column_label} {setting_label}')

# SVD: reuse the default-setting vectorizers and reduce their output with
# TruncatedSVD inside fit_model.
svd_runs = (
    (cov_stemmed, 'count_vectorizer stemmed svd'),
    (cov_lemmatized, 'count_vectorizer lemmatized svd'),
    (tfidf_stemmed, 'tfidf stemmed svd'),
    (tfidf_lemmatized, 'tfidf lemmatized svd'),
)
for svd_vectorizer, svd_report_name in svd_runs:
    fit_model(svd_vectorizer, svd_report_name, use_svd=True)

In [18]:
# Render everything collected by metric_manager during the runs above.
# NOTE(review): presumably the first call prints the per-model metrics table
# and the other two draw combined ROC-AUC plots; confirm against hw2.metrictool.
metric_manager.show_table_report()
metric_manager.show_united_auc_interactive()
metric_manager.show_united_auc_interactive_v2()

Name model                                       Threshold    F-Score    Precision    Recall    Accuracy    Roc-AUC    f1-score(macro)
---------------------------------------------  -----------  ---------  -----------  --------  ----------  ---------  -----------------
lection base                                         0.459      0.858        0.851     0.865       0.863      0.932              0.863
count_vectorizer stemmed base                        0.39       0.751        0.676     0.845       0.732      0.82               0.731
count_vectorizer lemmatized base                     0.413      0.772        0.715     0.84        0.763      0.85               0.763
tfidf stemmed base                                   0.398      0.762        0.683     0.86        0.742      0.838              0.741
tfidf lemmatized base                                0.438      0.78         0.727     0.841       0.773      0.863              0.773
count_vectorizer stemmed max_features=800            0.

#### Выводы:
1. Count_vectorizer и Tfidf показали в целом одинаковые результаты, незначительно опережая друг друга при повторных обучениях.
2. Lemmatized всегда выше stemmed, но незначительно.
3. Понижение max_features нелинейно ухудшает точность классификации.
4. Понижение max_df нелинейно ухудшает точность классификации.
5. Обучение модели только на словосочетаниях даёт значительно худший результат, чем анализ текста по одному слову.
6. Обучение модели на словосочетаниях вместе с отдельными словами не даёт заметного улучшения классификации.
7. При использовании SVD и сжатии количества фич в 5 раз показатели качества модели падают незначительно.