## Семинар 1 Индекс

## Intro

### работа с файлами и папками

In [1]:
import os

curr_dir = os.getcwd()
filepath = os.path.join(curr_dir, 'test.txt')

### os.path  
путь до файла

In [2]:
# возвращает полный путь до папки/файла по имени файла / папки
print(os.path.abspath(filepath))


# возвращает имя файла / папки по полному ти до него
print(os.path.basename(filepath))


# проверить существование директории - True / False
print(os.path.exists(curr_dir))

/Users/victoriaregina/infosearch/1 Index/test.txt
test.txt
True


### os.listdir  
возвращает список файлов в данной директории

In [3]:
os.listdir(curr_dir)

['.ipynb_checkpoints', 'lec1_Index.pdf', 'sem1_Index.ipynb', 'test.txt']

При обходе файлов не забывайте исключать системные директории, такие как .DS_Store

### os.walk
root - начальная директория  
dirs - список поддиректорий (папок)   
files - список файлов в этих поддиректориях  

In [4]:
for root, dirs, files in os.walk(curr_dir):
    for name in files:
        print(os.path.join(root, name))

/Users/victoriaregina/infosearch/1 Index/lec1_Index.pdf
/Users/victoriaregina/infosearch/1 Index/sem1_Index.ipynb
/Users/victoriaregina/infosearch/1 Index/test.txt
/Users/victoriaregina/infosearch/1 Index/.ipynb_checkpoints/sem1_Index-checkpoint.ipynb


> __os.walk__ возвращает генератор, это значит, что получить его элементы можно только проитерировавшись по нему  
но его легко можно превратить в list и увидеть все его значения

In [5]:
list(os.walk(curr_dir))

[('/Users/victoriaregina/infosearch/1 Index',
  ['.ipynb_checkpoints'],
  ['lec1_Index.pdf', 'sem1_Index.ipynb', 'test.txt']),
 ('/Users/victoriaregina/infosearch/1 Index/.ipynb_checkpoints',
  [],
  ['sem1_Index-checkpoint.ipynb'])]

### чтение файла 

In [6]:
fpath = 'test.txt'


# одним массивом  
with open(fpath, 'r') as f:  
    text = f.read() 

    
#по строкам, в конце каждой строки \n  
with open(fpath, 'r') as f:   
    text = f.readlines() 

    
#по строкам, без \n   
with open(fpath, 'r') as f:   
    text = f.read().splitlines() 

Напоминание про enumerate:    
> При итерации по списку вы можете помимо самого элемента получить его порядковый номер    
``` for i, element in enumerate(your_list): ...  ```    
Иногда для получения элемента делают так -  ``` your_list[i] ```, не надо так

##  Индекс 

Сам по себе индекс - это просто формат хранения данных, он не может осуществлять поиск. Для этого необходимо добавить к нему определенную метрику. Это может быть что-то простое типа булева поиска, а может быть что-то более специфическое или кастомное под задачу.

Давайте посмотрим, что полезного можно вытащить из самого индекса.    
По сути, индекс - это информация о частоте встречаемости слова в каждом документе.   
Из этого можно понять, например:
1. какое слово является самым часто употребимым / редким
2. какие слова встречаются всегда вместе - так можно парсить твиттер, fb, форумы и отлавливать новые устойчивые выражения в речи
3. как эти документы кластеризуются по N тематикам согласно словам, которые в них упоминаются 

## __Задача__: 

**Data:** Коллекция субтитров сезонов Друзьей. Одна серия - один документ.

**To do:** Постройте небольшой модуль поискового движка, который сможет осуществлять поиск по коллекции документов.
На входе запрос и проиндексированная коллекция (в том виде, как посчитаете нужным), на выходе отсортированный по релевантности с запросом список документов коллекции. 

Релизуйте:
    - функцию препроцессинга данных
    - функцию индексирования данных
    - функцию метрики релевантности 
    - собственно, функцию поиска

[download_friends_corpus](https://yadi.sk/d/yVO1QV98CDibpw)

Напоминание про defaultdict: 
> В качестве multiple values словаря рекомендую использовать ``` collections.defaultdict ```                          
> Так можно избежать конструкции ``` dict.setdefault(key, default=None) ```

In [1]:
import os
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [2]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [3]:
import nltk
import collections
import re

In [4]:
nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victoriaregina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
docs = []
seasons = []
names_docs = []
folders = [f for f in os.listdir('/Users/victoriaregina/Downloads/friends') if 'Store' not in f]
for f in folders:
    for file in os.listdir('/Users/victoriaregina/Downloads/friends/'+ f):
        filepath = '/Users/victoriaregina/Downloads/friends/'+ f + '/'+ file
        names_docs.append(filepath)
        with open(filepath, 'r') as doc:
            normal_forms = []
            doc = doc.read()
            tokenizer = RegexpTokenizer(r'\w+')
            for t in tokenizer.tokenize(doc):
                t = morph.parse(t)[0]
                if t.normal_form not in russian_stopwords and t.normal_form != 'это' and not re.match(r'[0-9A-Za-z]', t.normal_form):
                    normal_forms.append(t.normal_form)
                    st = ' '.join(normal_forms)
            docs.append(st)
    seasons.append(st)

In [7]:
print(len(names_docs))
print(len(seasons))

165
7


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = docs

vec = TfidfVectorizer()
X = vec.fit_transform(corpus)

df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names(), index=names_docs)

In [9]:
df.head(165)

Unnamed: 0,аа,ааа,аааа,ааааа,ааааааа,аааааау,аарон,аба,аббатство,абонемент,...,ёвить,ёй,ёкнуть,ёлка,ёлочный,ёпэрэсотэ,ёрл,ёрш,ёршик,ёще
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x01 - The One Where Monica Gets A Roommate.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x02 - The One With The Sonogram At The End.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x03 - The One With The Thumb.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x04 - The One With George Stephanopoulos.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x05 - The One With The East German Laundry Detergent.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x06 - The One With The Butt.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x07 - The One With The Blackout.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x08 - The One Where Nana Dies Twice.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x09 - The One Where Underdog Gets Away.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
/Users/victoriaregina/Downloads/friends/Friends - season 1/Friends - 1x10 - The One With The Monkey.ru.txt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0


# Задания (a) и (b)

In [10]:
dic = {}
for c in vec.get_feature_names():
    dic[sum(df[c])] = c
print('Самое частотное слово:', dic[max(dic.keys())], max(dic.keys()))
print('Самое редкое слово:', dic[min(dic.keys())], min(dic.keys()))

Самое частотное слово: весь 22.42568418527917
Самое редкое слово: элегантный 0.025461852769324996


# Задание (c)

In [11]:
in_all_docs = []
d = df.isin([0])
for c in vec.get_feature_names():
    if sum(d[c]) == 0:
        in_all_docs.append(c)
print('Слова, которые есть во всех текстах:', ', '.join(in_all_docs))

Слова, которые есть во всех текстах: весь, думать, ещё, знать, мочь, просто, сказать, хотеть


# Задание (d)

In [14]:
vec = TfidfVectorizer()
X = vec.fit_transform(seasons)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names(), index=['season 1', 'season 2', 'season 3', 'season 4',
                                                                       'season 5', 'season 6', 'season 7'])

In [19]:
df.head(7)

Unnamed: 0,аарон,абсолютно,автомат,автоответчик,ага,агентша,ада,адский,аду,аж,...,юбочка,являться,явно,яйцо,япония,яркий,ясмина,ясно,яхта,ящик
season 1,0.0,0.0,0.040378,0.072964,0.034514,0.0,0.024321,0.0,0.024321,0.048643,...,0.024321,0.0,0.024321,0.0,0.0,0.0,0.0,0.0,0.0,0.0
season 2,0.0,0.0,0.0,0.0,0.019495,0.027476,0.0,0.027476,0.0,0.0,...,0.0,0.022807,0.0,0.0,0.0,0.0,0.0,0.019495,0.0,0.0
season 3,0.0,0.0,0.0,0.0,0.017438,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.049154,0.049154,0.024577,0.049154,0.069752,0.0,0.049154
season 4,0.0,0.014228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032775,0.0,0.0
season 5,0.0,0.017723,0.047763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
season 6,0.0,0.031085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075692,0.0
season 7,0.059567,0.018347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.024723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
count = {}
for i in ['season 1', 'season 2', 'season 3', 'season 4', 'season 5', 'season 6', 'season 7']:
    d = df.at[i, 'чендлер']
    count[d] = i
print('Самый популярный сезон у Чендлера:', count[max(count.keys())])

Самый популярный сезон у Чендлера: season 6


In [16]:
count = {}
for i in ['season 1', 'season 2', 'season 3', 'season 4', 'season 5', 'season 6', 'season 7']:
    d = df.at[i, 'моника']
    count[d] = i
print('Самый популярный сезон у Моники:', count[max(count.keys())])

Самый популярный сезон у Моники: season 7


# Задание (e)

In [17]:
d = {}
characters = ['росс', 'фиби', 'моника', 'чендлер', 'джо', 'рэйчел']
for character in characters:
    try:
        d[sum(df[character])] = character
    except KeyError:
        break
print('Самое популярный герой:', d[max(d.keys())], max(d.keys()))

Самое популярный герой: росс 0.440358779236699
