In [40]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
with open('corpus') as f:
    data = f.read()
    
labels, texts = [], []

for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# создаем df
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels
trainDF.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [5]:
my_data = pd.read_csv('tweet_preprocessed.csv')
my_data.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,0,1,0.0,when father is dysfunctional and is so sel...,"['when', 'father', 'is', 'dysfunctional', 'and...","['father', 'dysfunctional', 'selfish', 'drags'...","['father', 'dysfunct', 'selfish', 'drag', 'kid...","['father', 'dysfunct', 'selfish', 'drag', 'kid..."
1,1,2,0.0,thanks for lyft credit cannot use cause t...,"['thanks', 'for', 'lyft', 'credit', 'can', 'no...","['thanks', 'lyft', 'credit', 'use', 'cause', '...","['thank', 'lyft', 'credit', 'use', 'caus', 'of...","['thank', 'lyft', 'credit', 'use', 'cau', 'off..."
2,2,3,0.0,bihday your majesty,"['bihday', 'your', 'majesty']","['bihday', 'majesty']","['bihday', 'majesti']","['bihday', 'majesti']"
3,3,4,0.0,model love you take with you all the time...,"['model', 'love', 'you', 'take', 'with', 'you'...","['model', 'love', 'take', 'time', 'ur']","['model', 'love', 'take', 'time', 'ur']","['model', 'love', 'take', 'time', 'ur']"
4,4,5,0.0,factsguide society now motivation,"['factsguide', 'society', 'now', 'motivation']","['factsguide', 'society', 'motivation']","['factsguid', 'societi', 'motiv']","['factsguid', 'societi', 'motiv']"


In [10]:
#help(CountVectorizer)

In [75]:
count_vectorizer_stemmed = CountVectorizer(max_df=0.9, max_features=1000, stop_words='english')
count_vectorizer_lemmatized = CountVectorizer(max_df=0.9, max_features=1000, stop_words='english')

# Создаем the Bag-of-Words модель
bag_of_words_stemmed = count_vectorizer_stemmed.fit_transform(my_data['tweet_stemmed'])
bag_of_words_lemmatized = count_vectorizer_lemmatized.fit_transform(my_data['tweet_lemmatized'])

# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer_stemmed.get_feature_names()
stemmed_df = pd.DataFrame(bag_of_words_stemmed.toarray(), columns = feature_names)

feature_names = count_vectorizer_lemmatized.get_feature_names()
lemmatized_df = pd.DataFrame(bag_of_words_lemmatized.toarray(), columns = feature_names)



In [76]:
stemmed_df.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
lemmatized_df.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
def get_data_frame(df_to_transform, df_to_fit, max_features, vectorizer, is_pca):
    
    count_vectorizer = vectorizer(max_df=0.9, max_features=max_features, stop_words='english')
    count_vectorizer.fit(df_to_fit)
    bag_of_words = count_vectorizer.transform(df_to_transform)
    feature_names = count_vectorizer.get_feature_names()
    df = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
    if is_pca:
        pca = PCA(n_components=10)
        df = pd.DataFrame(pca.fit_transform(df))
    return df

In [91]:
is_pca=False

In [94]:
get_data_frame(my_data['tweet_stemmed'], my_data['tweet_stemmed'], 100, CountVectorizer, is_pca).head()



Unnamed: 0,affirm,altwaystoh,alway,amaz,amp,beauti,best,bihday,bless,blog,...,wait,want,watch,way,week,weekend,wish,work,world,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
get_data_frame(my_data['tweet_stemmed'], my_data['tweet_stemmed'], 1000, CountVectorizer, True).head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.177644,-0.222372,0.028218,-0.14126,0.004197,-0.064297,0.036597,-0.062871,-0.006709,0.010797
1,-0.238328,-0.057656,0.79219,0.253301,0.01675,0.027463,0.002798,-0.012105,-0.012325,0.001211
2,-0.113952,-0.040541,-0.074555,0.079875,-0.016419,-0.095582,-0.048511,-0.07446,0.217742,0.013827
3,0.457645,1.118515,0.061498,-0.211784,0.005641,-0.085817,0.008702,-0.965389,-0.438675,0.633059
4,-0.150892,-0.034765,-0.037703,-0.036624,-0.013146,-0.077342,-0.022003,-0.024303,0.016978,-0.024908


In [95]:
train_x_idx, valid_x_idx, train_y_idx, valid_y_idx = train_test_split(range(len(trainDF['text'])), 
                                                      range(len(trainDF['label'])), 
                                                      test_size=0.2,
                                                      random_state=128)

# labelEncode целевую переменную
encoder = LabelEncoder()
train_y = encoder.fit_transform(trainDF['label'][train_y_idx])
valid_y = encoder.fit_transform(trainDF['label'][valid_y_idx])

In [98]:
errors_dict = {}
for vectorizer in (CountVectorizer, TfidfVectorizer):
    for is_pca in (False, True):
        for col_to_fit in ('tweet_stemmed', 'tweet_lemmatized'):
            for max_features in range(100, 2000, 200):
                df = get_data_frame(trainDF['text'], my_data[col_to_fit], max_features, vectorizer, is_pca)
                #print(df)
                classifier = LogisticRegression()
                classifier.fit(df.iloc[train_x_idx], train_y)
                predictions = classifier.predict(df.iloc[valid_x_idx])
                errors_dict[f'vectorizer {vectorizer}, is_pca {is_pca}, col_to_fit {col_to_fit}, max_features {max_features}'] = accuracy_score(valid_y, predictions)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt





In [100]:
pd.DataFrame.from_dict(errors_dict, orient='index', columns=['acc_score']).sort_values(by='acc_score', ascending=False)

Unnamed: 0,acc_score
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca False, col_to_fit tweet_stemmed, max_features 1700",0.7655
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca False, col_to_fit tweet_stemmed, max_features 1900",0.7655
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca False, col_to_fit tweet_lemmatized, max_features 1700",0.7655
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca False, col_to_fit tweet_lemmatized, max_features 1900",0.7650
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca False, col_to_fit tweet_stemmed, max_features 1500",0.7585
...,...
"vectorizer <class 'sklearn.feature_extraction.text.CountVectorizer'>, is_pca True, col_to_fit tweet_lemmatized, max_features 100",0.6575
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca True, col_to_fit tweet_stemmed, max_features 900",0.6570
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca True, col_to_fit tweet_lemmatized, max_features 900",0.6565
"vectorizer <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, is_pca True, col_to_fit tweet_lemmatized, max_features 100",0.6475


Я думаю что так вышло из-за того, что я взял corpus из урока, а учил на своих данных))) в уроке корпус не прошел лематизацию и стемминг, что привело к тому что в нем почти не было слов, на которых учились CountVectorizer и TfidfVectorizer.

P.S Надеюсь я правильно понял требования к дз.