In [1]:
import pandas as pd
df_commercial = pd.read_csv('commercial_lemmatized.csv', names=['status', 'text'], header=None)
df_commercial_def = pd.read_csv('commercial_definition.csv', names=['status', 'cat_name'], header=None)
df_posts = pd.read_csv('lemmatized.csv', names=['index', 'text'], header=None)
df_categories = pd.read_csv('categories.csv', names=['index', 'cat_name'], header=None)


In [2]:
df_posts.head()

Unnamed: 0,index,text
0,13,рассказывать источник поле зрение федеральный ...
1,1,инсайдер сообщать лидер партия единый россия д...
2,2,ноябрь ростов временный торговый точка аксайск...
3,5,роснано оказываться грань дефолт топ менеджер ...
4,1,свежий выступление председатель ск быстрыкин н...


In [24]:
df_categories.head()

Unnamed: 0,index,cat_name
0,1,Политика
1,2,Экономика и бизнес
2,3,Телеком
3,4,Компромат
4,5,Биржа и финансы


In [25]:
df_commercial.head()

Unnamed: 0,status,text
0,1,рейтинг канал политический телеграм цитировани...
1,1,телегр неделя тотальный qr кодирование цензури...
2,1,новый дайджест telegram повестка неделя сделат...
3,1,самый известный оппозиционный канал telegram с...
4,1,большинство сми телегр канал давно раскупать п...


In [26]:
df_commercial_def.head()

Unnamed: 0,status,cat_name
0,1,Реклама
1,0,Не реклама


In [3]:
res = df_posts.merge(df_categories, how='left', on='index')
p = pd.DataFrame(res['cat_name'].value_counts())
p

Unnamed: 0,cat_name
Политика,1441
Экономика и бизнес,1109
Губернаторы,831
Компромат,794
Проишествия,591
Министерство ИноДел,568
Биржа и финансы,535
COVID-19,414
Банки,397
Криминал,250


In [28]:
posts_count = df_posts.shape[0]
posts_count

7778

In [29]:
commercial_count = df_commercial.shape[0]
commercial_count

928

In [30]:
import random
random_none_commercial_ids = random.sample(range(0, posts_count), commercial_count)

In [31]:
lst_none_commercial = {'status': [], 'text': []}
for i in random_none_commercial_ids:
    lst_none_commercial['text'].append(df_posts.at[i, 'text'])
    lst_none_commercial['status'].append(0)
df_none_commercial = pd.DataFrame(lst_none_commercial)

In [32]:
df_commercial_mod = df_commercial.append(df_none_commercial)
df_commercial_mod.shape[0]

1856

In [33]:
df_commercial_mod[df_commercial_mod['text']==''].index.values.astype(int)

array([], dtype=int32)

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_valid, y_train, y_valid = train_test_split(df_commercial_mod['text'], df_commercial_mod['status'], test_size=0.1, random_state=42)


In [36]:
X_train.shape

(1670,)

In [37]:
X_valid.shape

(186,)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [39]:
X_train.shape

(1336,)

In [40]:
X_test.shape

(334,)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [43]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', SGDClassifier(random_state=42))])
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KNeighborsClassifier(n_neighbors=10))])
sgd_ppl_clf.fit(X_train, y_train)
knb_ppl_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('knb_clf', KNeighborsClassifier(n_neighbors=10))])

In [44]:
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75       171
           1       0.75      0.72      0.73       163

    accuracy                           0.74       334
   macro avg       0.74      0.74      0.74       334
weighted avg       0.74      0.74      0.74       334



In [45]:
predicted_sgd = knb_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77       187
           1       0.70      0.75      0.72       147

    accuracy                           0.75       334
   macro avg       0.75      0.75      0.75       334
weighted avg       0.75      0.75      0.75       334



In [46]:
parameters = { 
              'sgd_clf__loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
              'sgd_clf__class_weight':[None, 'balanced'],
              'sgd_clf__penalty':[None, 'l2', 'l1', 'elasticnet'],
              'tfidf__strip_accents':['ascii', 'unicode', None],
               'tfidf__ngram_range':[(1,2), (1,3), (1,4)]
              }
model = GridSearchCV(sgd_ppl_clf, parameters, cv=4, n_jobs=-1).fit(X_train, y_train)

In [25]:
print('Best score and parameter combination for sgd:')
print(model.best_score_, model.best_params_) 

Best score and parameter combination for sgd:
0.7806691449814126 {'sgd_clf__class_weight': None, 'sgd_clf__loss': 'log', 'sgd_clf__penalty': 'l2', 'tfidf__ngram_range': (1, 2), 'tfidf__strip_accents': 'unicode'}


In [47]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode')),
    ('sgd_clf', SGDClassifier(penalty='l2', loss='log', random_state=42))
])
sgd_ppl_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode')),
                ('sgd_clf', SGDClassifier(loss='log', random_state=42))])

In [48]:
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

           0       0.81      0.76      0.79       187
           1       0.72      0.77      0.74       147

    accuracy                           0.77       334
   macro avg       0.76      0.77      0.76       334
weighted avg       0.77      0.77      0.77       334



In [49]:
predicted_sgd = sgd_ppl_clf.predict(X_valid)
print(metrics.classification_report(predicted_sgd, y_valid))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76        98
           1       0.73      0.76      0.74        88

    accuracy                           0.75       186
   macro avg       0.75      0.75      0.75       186
weighted avg       0.75      0.75      0.75       186



In [50]:
from joblib import dump, load

In [51]:
dump(sgd_ppl_clf, 'commercial_model.joblib')

['commercial_model.joblib']

In [57]:
sgd_ppl_clf = load('commercial_model.joblib')

In [58]:
sgd_ppl_clf.predict([''])

array([1], dtype=int64)