In [1]:
import pandas as pd
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Загрузка и очистка данных

In [3]:
df = pd.read_csv('drive/MyDrive/school.csv', index_col=0)
df

Unnamed: 0,main_page,school
0,"\nздравствуйте\n,\nвы сейчас на главной страни...",1
1,\nхостинг от \nucoz\nуважаемые пользователи!\n...,1
2,,0
3,\n #js-show-iframe-wrapper{position:relative;d...,1
4,\n адрес школы\nадрес: \nадрес: ул. л...,1
...,...,...
3625,\nubooki\nглавная\nкниги\n© ubooki\nсделать \n...,0
3626,"\nмбоу""кандатская средняя школа"" - главная стр...",1
3627,\nглавная\nфото\nистория\nправила\nвидео\nтерм...,0
3628,"\nдобр\nо пожаловать\n!\n8 ""а"" класс \n""эврика...",1


In [4]:
df.isnull().sum()

main_page    26
school        0
dtype: int64

In [5]:
df.dropna(inplace=True)
df.reset_index(inplace=True,drop=True)
df

Unnamed: 0,main_page,school
0,"\nздравствуйте\n,\nвы сейчас на главной страни...",1
1,\nхостинг от \nucoz\nуважаемые пользователи!\n...,1
2,\n #js-show-iframe-wrapper{position:relative;d...,1
3,\n адрес школы\nадрес: \nадрес: ул. л...,1
4,\nк юбилею любимой школе!\nмоу дубковская сош\...,1
...,...,...
3599,\nubooki\nглавная\nкниги\n© ubooki\nсделать \n...,0
3600,"\nмбоу""кандатская средняя школа"" - главная стр...",1
3601,\nглавная\nфото\nистория\nправила\nвидео\nтерм...,0
3602,"\nдобр\nо пожаловать\n!\n8 ""а"" класс \n""эврика...",1


In [33]:
df['school'].value_counts()

0    1843
1    1761
Name: school, dtype: int64

In [6]:
df = df.replace(to_replace=r'[^а-яА-ЯёЁa-zA-Z]', value=' ',regex=True)
df['main_page'] = df['main_page'].str.lower()

### Бейзлайн на правилах

In [7]:
X = df.drop(columns=['school'])
y = df['school']

In [31]:
X['prediction'] = df['main_page'].apply(lambda x: 1 if ('школ' in x) or ('school' in x)  else 0)
X

Unnamed: 0,main_page,prediction
0,здравствуйте вы сейчас на главной странице ...,1
1,хостинг от ucoz уважаемые пользователи мы в...,0
2,js show iframe wrapper position relative di...,1
3,адрес школы адрес адрес ул луна...,1
4,к юбилею любимой школе моу дубковская сош на...,1
...,...,...
3599,ubooki главная книги ubooki сделать беспла...,0
3600,мбоу кандатская средняя школа главная стра...,1
3601,главная фото история правила видео терминолог...,0
3602,добр о пожаловать а класс эврика дев...,1


In [32]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

print(accuracy_score(y,X.prediction))
f1_score(y,X.prediction)

0.9087125416204217


0.9066136815214306

In [10]:
print(classification_report(X.prediction, y))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1914
           1       0.87      0.91      0.89      1690

    accuracy                           0.89      3604
   macro avg       0.89      0.89      0.89      3604
weighted avg       0.89      0.89      0.89      3604



# BOW + логрег

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
x_train, x_test, y_train, y_test = train_test_split(df['main_page'], df['school'])

In [13]:
vec = CountVectorizer(ngram_range=(1, 2))
bow = vec.fit_transform(x_train)

In [14]:
clf = LogisticRegression(max_iter=500)
clf.fit(bow, y_train)

In [15]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(y_test, pred))
print(accuracy_score(y_test,pred))
f1_score(y_test,pred)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       467
           1       0.93      0.86      0.89       434

    accuracy                           0.90       901
   macro avg       0.90      0.90      0.90       901
weighted avg       0.90      0.90      0.90       901

0.8990011098779135


0.8912783751493428

#TF-IDF + логрег

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
vec = TfidfVectorizer(ngram_range=(1, 3))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(max_iter = 500)
clf.fit(bow, y_train)


In [18]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))
print(accuracy_score(y_test,pred))
f1_score(y_test,pred)

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       488
           1       0.86      0.91      0.88       413

    accuracy                           0.89       901
   macro avg       0.89      0.89      0.89       901
weighted avg       0.89      0.89      0.89       901

0.8901220865704772


0.8831168831168831

# Градиентный бустинг

In [19]:
%%capture
!pip install xgboost

In [20]:
import xgboost

In [27]:
boosting_model = xgboost.XGBClassifier(n_estimators=500)
boosting_model.fit(bow, y_train)

In [28]:
pred = boosting_model.predict(vec.transform(x_test))
print(classification_report(y_test, pred))
print(accuracy_score(y_test,pred))
f1_score(y_test,pred)

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       467
           1       0.92      0.92      0.92       434

    accuracy                           0.92       901
   macro avg       0.92      0.92      0.92       901
weighted avg       0.92      0.92      0.92       901

0.9223085460599334


0.9191685912240184

In [24]:
%%capture
!pip install catboost
import catboost

In [25]:
boosting_model = catboost.CatBoostClassifier(n_estimators=200,task_type="GPU",verbose=500)

boosting_model.fit(bow, y_train)

<catboost.core.CatBoostClassifier at 0x7ce9044fe710>

In [26]:
pred = boosting_model.predict(vec.transform(x_test))
print(classification_report(y_test, pred))
print(accuracy_score(y_test,pred))
f1_score(y_test,pred)

              precision    recall  f1-score   support

           0       0.90      0.93      0.91       467
           1       0.92      0.89      0.90       434

    accuracy                           0.91       901
   macro avg       0.91      0.91      0.91       901
weighted avg       0.91      0.91      0.91       901

0.9078801331853497


0.9029239766081871