## Оценка сложности фильмомв

Основная цель: обучить модель, которая сможет определять сложность фильма для понимания на английском языке. В качестве градации будет использована стандартная система оценки уровня по CEFR (А1, А2, В1, В2, С1, С2).

В данной работе необходимо:
- обрработать субтитры к фильмам и подготовить их для обучения модели.
- обучить модель и получить приемлимую метрику accuracy.
- упаковать модель в необходимом виде и предоставить заказчику.

## Выгрузка и подготовка данных

Подгружаем необходимые библиотеки и создаем переменные

In [2]:
import pysrt
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime
import unicodedata
import spacy
from pypdf import PdfReader
from sklearn.ensemble import RandomForestClassifier as RFC
from catboost import CatBoostClassifier as CBC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from notifiers import get_notifier
import copy
import pickle
import converter

In [351]:
OPTION_CONVERT = 1
OPTION_RENAME = 0
RANDOM_STATE = 8974651

In [352]:
def make_notifier(
    token='5658096333:AAHiwKbKFkFDhJH-0eSKwDeXmJnYsqXc88A',
    chat_id=-831532630):
    def f(text):
        notifier = get_notifier('telegram')
        notifier.notify(
            message=text,
            token=token,
            chat_id=chat_id)
    return f

In [353]:
bot_send = make_notifier()

In [354]:
pd.set_option('display.max_rows', None)

In [355]:
if OPTION_CONVERT:
    labels = pd.read_csv('./films_table.csv')
else:
    labels = pd.read_csv('./prepared_df.csv')

In [356]:
def get_info(df):
    display(df.sample(5))
    display(df.info())

In [357]:
get_info(labels)

Unnamed: 0,name,level,subtitles,type,source,file,Unnamed: 6
198,The Green Knight (2021),C2,Yes,movie,https://magazine.skyeng.ru/6-novyh-filmov-dlja...,The.Green.Knight.2021.varivoda_sp.srt,
99,Entrapment,B2,Yes,movie,unknow,Entrapment.srt,
6,Toy story,A2/A2+,Yes,movie,unknow,Toy_story(1995).srt,
78,The Greatest Showman ️,A2/A2+,Yes,movie,unknow,The_greatest_showman(2017).srt,
163,The School of Rock,B2,Yes,movie,https://easyspeak.ru/blog/sovety-i-sekrety/luc...,The_school_of_rock.srt,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        204 non-null    object
 1   level       204 non-null    object
 2   subtitles   204 non-null    object
 3   type        204 non-null    object
 4   source      204 non-null    object
 5   file        204 non-null    object
 6   Unnamed: 6  1 non-null      object
dtypes: object(7)
memory usage: 11.3+ KB


None

In [358]:
labels = labels.rename(columns={'name':'Movie','level':'Level'})

In [359]:
get_info(labels)

Unnamed: 0,Movie,Level,subtitles,type,source,file,Unnamed: 6
91,Milada,B1,Yes,movie,unknow,Milada(2017).srt,
147,Junior,B1,Yes,movie,https://easyspeak.ru/blog/sovety-i-sekrety/luc...,Junior.srt,
13,The man called Flinstone,A2/A2+,Yes,movie,unknow,The_man_called_Flintstone(1966).srt,
188,Tesla (2016),B2,Yes,movie,https://moviesbylevels.wordpress.com,Tesla.2020.varivoda_sp.srt,
168,Captain Corelli's Mandolin,B2,Yes,movie,https://easyspeak.ru/blog/sovety-i-sekrety/luc...,Captain_corellis_mandolin.srt,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie       204 non-null    object
 1   Level       204 non-null    object
 2   subtitles   204 non-null    object
 3   type        204 non-null    object
 4   source      204 non-null    object
 5   file        204 non-null    object
 6   Unnamed: 6  1 non-null      object
dtypes: object(7)
memory usage: 11.3+ KB


None

In [360]:
labels = labels.loc[:,['Movie','Level']]

В данных есть пропуски, но в важных столбцах с названиями и оценками все хорошо.

Для будущей работы с названиями приведем их к одному формату

In [361]:
for i in range(len(labels)):
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].lower()
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace(' ','_')
    if labels.loc[i,'Movie'][-1:] == "\n": labels.loc[i,'Movie'] = labels.loc[i,'Movie'][:-1]
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace("’",'')
    if labels.loc[i,'Movie'][-1] == "_": labels.loc[i,'Movie'] = labels.loc[i,'Movie'][:-1]
    j = -2
    while labels.loc[i,'Movie'][-1] == ")":
        if labels.loc[i,'Movie'][j] == "(":
            labels.loc[i,'Movie'] = labels.loc[i,'Movie'][:j-1]
        else:
            j -= 1
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace('-','_')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace(':','_')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace('__','_')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace(',','')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace('.','')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace('&','and')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace('(','')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace(')','')
    labels.loc[i,'Movie'] = labels.loc[i,'Movie'].replace("'",'')

Так же поступим с названиями фильмов

In [362]:
if OPTION_RENAME:
    names = os.listdir('./subtitles/')
    j = 0
    for name in names:
        n = copy.copy(name)
        name = name.lower()
        name = re.sub(r"\.varivoda_sp|\(\d{4}\)|\d{4}|'",'', name)
        name = re.sub(r' ','_', name)
        i = -4
        if name[-5] == '.': i = -5
        name = re.sub(r'[\.-]','_', name[:i]) + ".srt"
        name = re.sub(r'_+','_',name)
        try:
            os.rename(f'./subtitles/{n}',f'./subtitles/{name}')
        except:
            print(n)

Вроде бы все сделано, нужно только проверить.

In [363]:
content = os.listdir('./subtitles/')
names = list(labels['Movie'])
not_in_name = []
not_in_content = []
for i in names:
    if f'{i}.srt' not in content: not_in_content.append(i)
for i in content:
    if i[:-4] not in names: not_in_name.append(i)

In [364]:
not_in_name

['brenв_brown_the_call_to_courage.srt',
 'eurovision_song_contest.srt',
 'moulin_rouge.srt',
 'peppa_pig_my_first_cinema_experience.srt',
 'the_card_counter.srt',
 'the_extra_terrestrial.srt',
 'the_greatest_showman.srt',
 'the_man_called_flintstone.srt',
 'wallace_grommit.srt',
 'we_are_the_millers.srt']

In [365]:
not_in_content.sort()
not_in_content

['\r\npeppa_pig_my_first_cinema_experience',
 'addams_family_values',
 'brené_brown_the_call_to_courage',
 'card_counter',
 'et_the_extra_terrestrial',
 'eurovision_song_contest_the_story_of_fire_saga',
 'moulin_rouge_️',
 'the_greatest_showman_️',
 'the_man_called_flinstone',
 'wallace_and_grommit_the_curse_of_the_were_rabbit',
 'were_the_millers']

Некоторые названия не попадают под шаблоны, по этому приводим их к нужным именам ручками.

In [366]:
labels.loc[labels['Movie'] == 'the_man_called_flinstone','Movie'] = 'the_man_called_flintstone'
labels.loc[labels['Movie'] == 'eurovision_song_contest_the_story_of_fire_saga','Movie'] = 'eurovision_song_contest'
labels.loc[labels['Movie'] == 'were_the_millers','Movie'] = 'we_are_the_millers'
labels.loc[labels['Movie'] == '\r\npeppa_pig_my_first_cinema_experience','Movie'] = 'peppa_pig_my_first_cinema_experience'
labels.loc[labels['Movie'] == 'brené_brown_the_call_to_courage','Movie'] = 'brenв_brown_the_call_to_courage'
labels.loc[labels['Movie'] == 'wallace_and_grommit_the_curse_of_the_were_rabbit','Movie'] = 'wallace_grommit'
labels.loc[labels['Movie'] == 'mechanic:_resurrection','Movie'] = 'mechanic_resurrection'
labels.loc[labels['Movie'] == 'moulin_rouge_️','Movie'] = 'moulin_rouge'
labels.loc[labels['Movie'] == 'et_the_extra_terrestrial','Movie'] = 'the_extra_terrestrial'
labels.loc[labels['Movie'] == 'the_greatest_showman_️','Movie'] = 'the_greatest_showman'
labels.loc[labels['Movie'] == 'card_counter','Movie'] = 'the_card_counter'

Есть пара сериалов к каторым нет субтитров, просто удалим их, так как оценить сложность сериалов проблемотично, она меняется от серии к серии.

In [367]:
labels = labels.drop(index = labels.loc[labels['Movie'] == 'addams_family_values'].index).reset_index(drop=True)

In [368]:
content = os.listdir('./subtitles/')
names = list(labels['Movie'])
not_in_name = []
not_in_content = []
for i in names:
    if f'{i}.srt' not in content: not_in_content.append(i)
for i in content:
    if i[:-4] not in names: not_in_name.append(i)
not_in_content


[]

In [369]:
not_in_name

[]

Теперь не плохо.

Теперь когда все имеет единый вид, нужно проверить на дубликаты.

In [370]:
labels['Movie'].duplicated().sum()

6

In [371]:
labels.loc[labels['Movie'].duplicated()]

Unnamed: 0,Movie,Level
86,kubo_and_the_two_strings,B1
90,inside_out,B1
92,sleepless_in_seattle,B1
93,the_terminal,B1
102,the_blind_side,B2
195,banking_on_bitcoin,C1


5 штук не плохо, просто дропним.

In [372]:
labels = labels.drop_duplicates(subset = "Movie").reset_index(drop=True)

In [373]:
labels['Movie'].duplicated().sum()

0

Прекрасно

In [374]:
labels['Level'].value_counts()

B2            70
B1            68
A2/A2+        26
C1            13
B1, B2         8
A2/A2+, B1     5
A2             3
A1             2
               1
C2             1
Name: Level, dtype: int64

Корректировать быдем селдующим образом:
- A2/A2+ - А2
- B1, B2 - В2
- 2/A2+, B1 - В1

In [375]:
labels.loc[labels['Level'] == 'A2/A2+', 'Level'] = 'A2'
labels.loc[labels['Level'] == 'B1, B2', 'Level'] = 'B2'
labels.loc[labels['Level'] == 'A2/A2+, B1', 'Level'] = 'B1'

In [376]:
labels['Level'].value_counts()

B2    78
B1    73
A2    29
C1    13
A1     2
       1
C2     1
Name: Level, dtype: int64

Пропуск просто удалим. По скольку фильм С2 всего один его тоже придется убрать как и А2.

In [377]:
labels = labels.loc[(labels['Level'] != " ") & (labels['Level'] != "C2") & (labels['Level'] != "A1")]
labels['Level'].value_counts()

B2    78
B1    73
A2    29
C1    13
Name: Level, dtype: int64

### Переводим субтитры в txt

Создадим функцию которая будет убирать весь html, не нужные команды в скобках и не обчные буквы.

In [378]:
def convert_srt_to_txt(film):
    movie = pysrt.open(f'./subtitles/{film}.srt',encoding='iso-8859-1')
    for j in range(10):    
        # создаем файл txt с названием фильма и открываем субтитры
        file = open(f'./Subtitles_txt/{film}_{j}.txt', 'w+')
        labels.loc[len(labels.index),'Movie'] = f'{film}_{j}'
        labels.loc[labels['Movie'] == f'{film}_{j}', 'Level'] = labels.loc[labels['Movie'] == f'{film}', "Level"].values
        total_time = 0
        total_words = 0
        string = ''
        # проходимся по каждой строке субтитров и обрабатываем ее
        for i in range(j,len(movie),10):
            
            # приводим все в нормальную кодировку
            movie_str = unicodedata.normalize('NFKD', movie[i].text).encode('ASCII', 'ignore').decode('utf-8', 'ignore')
            
            # в некоторых субтитрах есть имена авторов и сайты источники, нам они не нужны
            try:
                re.search(r"\w{3}\..*\.\w{3}", movie_str)[0]
                continue
            except:
                next
            
            # убираем не обычные символы
            movie_str = re.sub(r'[♪]','', movie_str)
            
            # убироаем html 
            movie_str = BeautifulSoup(movie_str, "html.parser").text

            # лишние пробелы и преносы строк
            movie_str = re.sub(r'\s',' ',movie_str)
            movie_str = re.sub(r'{.*?}|\(.*?\)|[,.!?"]|- ',' ', movie_str)
            movie_str = re.sub(r'-',' ',movie_str)
            if movie_str == "": continue
            if movie_str[0] == " ": movie_str = movie_str[1:]
            movie_str = movie_str.lower()

            # подсчет слов и затраченого времени на один субтитр
            count_words = len(movie_str.split())
            duration = movie[i].duration.to_time()
            duration = duration.isoformat(timespec='milliseconds')
            duration = duration.split(':')
            duration = (int(duration[0])*60+int(duration[1]))*60 + float(duration[2]) + 0.001
        
            # записываем итоговую строку и информацию по ней
            string += f'{movie_str} '
            total_time += duration
            total_words += count_words
        
        # записываем итоговые цифры и закрываем файл
        labels.loc[labels['Movie'] == f'{film}_{j}', 'total_time'] = total_time
        labels.loc[labels['Movie'] == f'{film}_{j}', 'total_words'] = total_words
        labels.loc[labels['Movie'] == f'{film}_{j}', 'raito'] = total_words/total_time
        string = re.sub(r'\s+',' ',string)
        file.write(string)
        file.close()

Запускаем обработку субтитров

In [379]:
labels = labels.loc[(labels['Movie'] != 'banking_on_bitcoin')&(labels['Movie'] != 'sweet_home_alabama')&(labels['Movie'] != 'daddy_day_care')&(labels['Movie'] != 'frozen')]

In [380]:
l = len(labels.index)
labels = labels.reset_index(drop=True)

In [381]:
if OPTION_CONVERT:
    for name in labels.loc[:l,'Movie']:
        convert_srt_to_txt(name)



In [382]:
labels = labels.loc[l:]

In [383]:
get_info(labels)

Unnamed: 0,Movie,Level,total_time,total_words,raito
1450,youve_got_mail_1,B1,395.622,1049.0,2.651521
1785,citizen_kane_6,B2,422.318,1278.0,3.026156
1126,despicable_me_7,B2,224.022,520.0,2.321201
1285,the_hunger_games_6,B2,249.12,585.0,2.348266
1719,as_good_as_it_gets_0,B2,554.312,1280.0,2.309169


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1890 entries, 189 to 2078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie        1890 non-null   object 
 1   Level        1890 non-null   object 
 2   total_time   1890 non-null   float64
 3   total_words  1890 non-null   float64
 4   raito        1890 non-null   float64
dtypes: float64(3), object(2)
memory usage: 74.0+ KB


None

In [384]:
def convert_to_lemm (name):
    nlp = spacy.load("en_core_web_sm")
    file = open(f'./Subtitles_txt/{name}.txt')
    text = file.read()
    file_lemm = open(f'./sub_lemm/{name}.txt', 'w+')
    doc = nlp(text)
    for token in doc:
        if token.is_stop: continue
        file_lemm.write(f'{token.lemma_} ')
    file.close()
    file_lemm.close()

In [385]:
if OPTION_CONVERT:
    for name in labels['Movie']:
        convert_to_lemm(name)

In [386]:
if OPTION_CONVERT:
    pdf_dic = os.listdir('./dic')  
    for dic in pdf_dic:
        reader = PdfReader(f"./dic/{dic[:-4]}.pdf")
        file = open(f'./Dic_txt/{dic[:-4]}.txt','+w',encoding="utf-8")
        number_of_pages = len(reader.pages)
        for i in range(number_of_pages):
            page = reader.pages[i].extract_text()
            page = re.sub(r'©.*13| .+?\n',' ',page)
            page = re.sub(r'\s+',' ',page)
            file.write(page)
        file.close()

In [387]:
if OPTION_CONVERT:
    txt_dic = os.listdir('./Dic_txt/')
    levels = [
            'A1',
            'A2',
            'B1',
            'B2',
            'C1',
            'C2',
            ]
    lev = {"A1":1,
        "A2":1,
        "B1":1,
        "B2":1,
        "C1":1,
        "C2":1}
    for dic in txt_dic:
        for i in range(len(levels)):
            if lev[levels[i]]:
                level = open(f'./levels/{levels[i]}.txt','+w')
                lev[levels[i]] = 0
            else:
                level = open(f'./levels/{levels[i]}.txt','a')
            file_dic = open(f'./Dic_txt/{dic}')
            text = file_dic.read()
            start = text.find(levels[i])
            try:
                end = text.find(levels[i+1])
            except:
                end = len(text)
            if start == -1: continue
            level.write(text[start+3:end])
        level.close()
        file_dic.close()

In [388]:
A1 = open('./levels/A1.txt').read()
A2 = open('./levels/A2.txt').read()
B1 = open('./levels/B1.txt').read()
B2 = open('./levels/B2.txt').read()
C1 = open('./levels/C1.txt').read()
C2 = open('./levels/C2.txt').read()
levels_txt = {'a1':A1,
            'a2':A2,
            'b1':B1,
            'b2':B2,
            'c1':C1,
            'c2':C2}

In [389]:
lemm_sub = os.listdir('./sub_lemm/')
for lemm in lemm_sub:
    sub = open(f'./sub_lemm/{lemm}').read().split()
    labels.loc[labels['Movie'] == lemm[:-4],'uniq'] = len(set(sub))
    lev_count = {'a1':0,
                 'a2':0,
                 'b1':0,
                 'b2':0,
                 'c1':0,
                 'c2':0}
    for word in sub:
        for i in lev_count.keys():
            if word in levels_txt[i]: 
                lev_count[i] += 1
                break
    for k in lev_count.keys():
        labels.loc[labels['Movie'] == lemm[:-4], k] = lev_count[k]
    lev_count = {'a1':0,
                 'a2':0,
                 'b1':0,
                 'b2':0,
                 'c1':0,
                 'c2':0}
    for word in set(sub):
        for i in lev_count.keys():
            if word in levels_txt[i]: 
                lev_count[i] += 1
                break
    for k in lev_count.keys():
        labels.loc[labels['Movie'] == lemm[:-4], f'{k}_uniq'] = lev_count[k]

In [390]:
labels.loc[labels['Level'] == 'A1', 'Level'] = 0
labels.loc[labels['Level'] == 'A2', 'Level'] = 1
labels.loc[labels['Level'] == 'B1', 'Level'] = 2
labels.loc[labels['Level'] == 'B2', 'Level'] = 3
labels.loc[labels['Level'] == 'C1', 'Level'] = 4
labels.loc[labels['Level'] == 'C2', 'Level'] = 5
labels['Level'] = labels['Level'].astype('int')

In [391]:
get_info(labels)

Unnamed: 0,Movie,Level,total_time,total_words,raito,uniq,a1,a2,b1,b2,c1,c2,a1_uniq,a2_uniq,b1_uniq,b2_uniq,c1_uniq,c2_uniq
1980,the_prince_of_egypt_1,3,292.092,639.0,2.187667,159.0,117.0,26.0,10.0,9.0,5.0,0.0,82.0,23.0,8.0,7.0,5.0,0.0
738,titanic_9,3,447.917,1230.0,2.746044,318.0,284.0,74.0,20.0,29.0,4.0,0.0,159.0,49.0,16.0,25.0,4.0,0.0
1617,just_married_8,2,371.968,848.0,2.279766,228.0,203.0,28.0,14.0,10.0,5.0,0.0,120.0,21.0,13.0,9.0,3.0,0.0
517,love_actually_8,2,535.216,1118.0,2.088876,307.0,273.0,58.0,21.0,22.0,3.0,0.0,142.0,45.0,17.0,20.0,3.0,0.0
2023,wallace_grommit_4,3,194.733,430.0,2.208152,137.0,82.0,17.0,10.0,16.0,7.0,0.0,55.0,17.0,9.0,11.0,5.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1890 entries, 189 to 2078
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie        1890 non-null   object 
 1   Level        1890 non-null   int32  
 2   total_time   1890 non-null   float64
 3   total_words  1890 non-null   float64
 4   raito        1890 non-null   float64
 5   uniq         1890 non-null   float64
 6   a1           1890 non-null   float64
 7   a2           1890 non-null   float64
 8   b1           1890 non-null   float64
 9   b2           1890 non-null   float64
 10  c1           1890 non-null   float64
 11  c2           1890 non-null   float64
 12  a1_uniq      1890 non-null   float64
 13  a2_uniq      1890 non-null   float64
 14  b1_uniq      1890 non-null   float64
 15  b2_uniq      1890 non-null   float64
 16  c1_uniq      1890 non-null   float64
 17  c2_uniq      1890 non-null   float64
dtypes: float64(16), int32(1), object(1)
memory usa

None

In [392]:
df = labels.loc[:,['Level','total_time','raito','uniq','a1','a2','b1','b2','c1','c2','a1_uniq','a2_uniq','b1_uniq','b2_uniq','c1_uniq']]

In [399]:
df = labels.loc[:,'Level':'c2_uniq']

In [43]:
tabl = df.pivot_table(index='Level',
                      values=['a1','a2','b1','b2','c1','c2'],
                      aggfunc='mean')

In [44]:
tabl

Unnamed: 0_level_0,a1,a2,b1,b2,c1,c2
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1944.172414,371.862069,140.034483,148.517241,59.206897,0.0
2,2192.112676,415.732394,160.253521,159.169014,68.478873,0.0
3,2207.272727,456.74026,192.623377,178.454545,80.38961,0.0
4,2041.833333,433.916667,169.0,167.833333,79.416667,0.0


In [308]:
get_info(df)

Unnamed: 0,Level,total_time,raito,uniq,a1,a2,b1,b2,c1,c2,a1_uniq,a2_uniq,b1_uniq,b2_uniq,c1_uniq
1726,3,488.327,2.633481,397.0,0.202177,0.064541,0.026439,0.027216,0.013997,0.0,0.413098,0.176322,0.080605,0.078086,0.042821
681,2,538.063,2.217212,316.0,0.26404,0.04694,0.01425,0.012573,0.005868,0.0,0.506329,0.148734,0.053797,0.041139,0.022152
1447,3,356.936,3.098595,339.0,0.245931,0.04792,0.026221,0.018083,0.01085,0.0,0.483776,0.135693,0.053097,0.058997,0.035398
859,3,457.795,2.37661,323.0,0.22886,0.040441,0.022978,0.016544,0.009191,0.0,0.470588,0.130031,0.065015,0.04644,0.027864
1055,4,385.782,2.975774,319.0,0.222125,0.04878,0.030488,0.02439,0.011324,0.0,0.426332,0.15047,0.100313,0.084639,0.037618


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1890 entries, 0 to 1889
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Level       1890 non-null   int32  
 1   total_time  1890 non-null   float64
 2   raito       1890 non-null   float64
 3   uniq        1890 non-null   float64
 4   a1          1890 non-null   float64
 5   a2          1890 non-null   float64
 6   b1          1890 non-null   float64
 7   b2          1890 non-null   float64
 8   c1          1890 non-null   float64
 9   c2          1890 non-null   float64
 10  a1_uniq     1890 non-null   float64
 11  a2_uniq     1890 non-null   float64
 12  b1_uniq     1890 non-null   float64
 13  b2_uniq     1890 non-null   float64
 14  c1_uniq     1890 non-null   float64
dtypes: float64(14), int32(1)
memory usage: 293.4 KB


None

In [400]:
train_df, test_df = train_test_split(df, test_size=.25, random_state=RANDOM_STATE)

In [310]:
train_df['Level'].value_counts()

3    571
2    526
1    224
4     96
Name: Level, dtype: int64

In [311]:
test_df['Level'].value_counts()

3    199
2    184
1     66
4     24
Name: Level, dtype: int64

In [401]:
train_df_features = train_df.drop(["Level"], axis= 1)
train_df_ans = train_df["Level"]
test_df_features = test_df.drop(["Level"], axis= 1)
test_df_ans = test_df["Level"]

In [402]:
column = train_df_features.columns
scaler = StandardScaler()
scaler.fit(train_df_features[column])
pd.options.mode.chained_assignment = None
train_df_features[column] = scaler.transform(train_df_features[column])
test_df_features[column] = scaler.transform(test_df_features[column])

In [404]:
with open('./models/scaller.pickle', 'wb') as scal:
    pickle.dump(scaler, scal)

In [403]:
parametrs = {'depth' :[6],
            'iterations' : [2550],
            'learning_rate' : [.05],
            'l2_leaf_reg': [3]}
cbc = GridSearchCV(CBC(random_seed= RANDOM_STATE,
                        logging_level='Silent',
                        auto_class_weights='Balanced'), 
                    parametrs, 
                    scoring = 'accuracy',
                    verbose = 3,
                    cv=3)
cbc.fit(train_df_features, train_df_ans)
print (cbc.best_params_)
print (cbc.best_score_)
bot_send(f'{cbc.best_score_}')
for s in cbc.best_params_:
    bot_send (f'{s}: {cbc.best_params_[s]}')
score = accuracy_score(test_df_ans,cbc.predict(test_df_features))
print(score)
bot_send(f'{score}')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END depth=6, iterations=2550, l2_leaf_reg=3, learning_rate=0.05;, score=0.677 total time=  10.8s
[CV 2/3] END depth=6, iterations=2550, l2_leaf_reg=3, learning_rate=0.05;, score=0.667 total time=  10.3s
[CV 3/3] END depth=6, iterations=2550, l2_leaf_reg=3, learning_rate=0.05;, score=0.674 total time=  10.2s
{'depth': 6, 'iterations': 2550, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
0.6725448214904265
0.7420718816067653


In [405]:
with open('./models/model.pickle', 'wb') as model:
    pickle.dump(cbc,model)

In [3]:
with open('./models/model.pickle', 'rb') as m:
    cbc = pickle.load(m)

In [4]:
best = cbc.best_estimator_

In [5]:
best.feature_importances_

array([13.78589158,  8.22839241, 14.64950686, 11.02868594, 10.36857249,
        4.99371282,  5.08134256,  5.80630085,  5.0785227 ,  0.        ,
        5.84152358,  4.31965299,  3.79869197,  3.88507827,  3.13412498,
        0.        ])

In [6]:
best.feature_names_

['total_time',
 'total_words',
 'raito',
 'uniq',
 'a1',
 'a2',
 'b1',
 'b2',
 'c1',
 'c2',
 'a1_uniq',
 'a2_uniq',
 'b1_uniq',
 'b2_uniq',
 'c1_uniq',
 'c2_uniq']

In [315]:
parametrs = {'n_estimators' : [328],
            'max_depth' :[3],
            'min_samples_leaf' : range(2,6),
            'min_samples_split': range(2,11,2)}
rfc = GridSearchCV(RFC(random_state= RANDOM_STATE, class_weight = 'balanced'), 
                    parametrs, 
                    scoring = 'accuracy', 
                    verbose = 3,
                    cv=2)
rfc.fit(train_df_features, train_df_ans)
print (rfc.best_params_)
print (rfc.best_score_)
bot_send(f'{rfc.best_score_}')
for s in rfc.best_params_:
    bot_send (f'{s}: {rfc.best_params_[s]}')
score = accuracy_score(test_df_ans,rfc.predict(test_df_features))
print(score)
bot_send(f'{score}')

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV 1/2] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=328;, score=0.422 total time=   0.5s
[CV 2/2] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=328;, score=0.448 total time=   0.5s
[CV 1/2] END max_depth=3, min_samples_leaf=2, min_samples_split=4, n_estimators=328;, score=0.422 total time=   0.5s
[CV 2/2] END max_depth=3, min_samples_leaf=2, min_samples_split=4, n_estimators=328;, score=0.448 total time=   0.5s
[CV 1/2] END max_depth=3, min_samples_leaf=2, min_samples_split=6, n_estimators=328;, score=0.426 total time=   0.5s
[CV 2/2] END max_depth=3, min_samples_leaf=2, min_samples_split=6, n_estimators=328;, score=0.444 total time=   0.5s
[CV 1/2] END max_depth=3, min_samples_leaf=2, min_samples_split=8, n_estimators=328;, score=0.427 total time=   0.5s
[CV 2/2] END max_depth=3, min_samples_leaf=2, min_samples_split=8, n_estimators=328;, score=0.442 total time=   0.5s
[CV