Извлечём тексты страниц компаний, на которые ссылается первая колонка таблицы на странице https://en.wikipedia.org/wiki/List_of_film_production_companies.

In [1]:
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup
from bs4.element import Comment
from tqdm import tqdm

# Requests below are made with verify=False; silence the resulting warning spam.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Shared HTTP session used for every Wikipedia request in this notebook.
s = requests.Session()

In [3]:
# Index page whose table links to the individual production-company articles.
u = 'https://en.wikipedia.org/wiki/List_of_film_production_companies'

In [4]:
# Fetch the index page.
# NOTE(review): verify=False disables TLS certificate verification; it is kept
# to match the rest of the notebook, but confirm that it is really required.
resp = s.get(u, verify=False, timeout=30)
resp.raise_for_status()  # fail loudly instead of silently parsing an error page
soup = BeautifulSoup(resp.text, 'html.parser')

table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError('No wikitable found on ' + u)

# Use the <tbody> when the markup has one, otherwise the table itself.
rows = (table.find('tbody') or table).find_all('tr')

# First data cell of every row (header rows have no <td> and are skipped),
# then the first hyperlink inside that cell (cells without a link are skipped).
first_cells = [cell for cell in (row.find('td') for row in rows) if cell]
links = [link for link in (cell.find('a', href=True) for cell in first_cells) if link]

# urljoin resolves site-relative hrefs against the wiki host and leaves
# absolute or protocol-relative hrefs intact instead of gluing the host in front.
companies = [urljoin('https://en.wikipedia.org', link['href']) for link in links]

companies[:5]

['https://en.wikipedia.org/wiki/Aleph_Producciones',
 'https://en.wikipedia.org/wiki/Argentina_Sono_Film',
 'https://en.wikipedia.org/wiki/BD_Cine',
 'https://en.wikipedia.org/wiki/Guacamole_Films',
 'https://en.wikipedia.org/wiki/Patagonik_Film_Group']

In [5]:
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page content.

    Returns False when the node's parent tag is one of the container names
    listed below, or when the node itself is an HTML comment; True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first so the comment check is only reached
    # for nodes whose parent is not in the hidden list.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

In [6]:
def text_from_wiki(url, timeout=30):
    """Download a Wikipedia page and return its visible article text as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    str
        The stripped text nodes inside the ``div.mw-parser-output`` container
        that pass ``tag_visible``, joined with single spaces. An empty string
        when the page has no such container.
    """
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept to match the rest of the notebook, but confirm it is really required.
    resp = s.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    body = soup.find('div', class_='mw-parser-output')
    if body is None:
        # No article container on this page. Return an empty text rather than
        # crashing, so one odd page does not abort a whole scraping run and the
        # result list stays aligned with the list of URLs.
        return u""
    text_nodes = body.find_all(text=True)
    visible_texts = filter(tag_visible, text_nodes)
    return u" ".join(t.strip() for t in visible_texts).strip()

In [7]:
# Download and flatten every company page, in the same order as `companies`.
# The loop variable is deliberately not named `u`: that name holds the
# list-page URL, and overwriting it would make the earlier fetch cell load
# the wrong page when it is re-run.
texts = []

for company_url in tqdm(companies):
    texts.append(text_from_wiki(company_url))

100%|██████████| 448/448 [01:11<00:00,  6.27it/s]


In [8]:
# Spot-check the text extracted from the first company page.
texts[0]

"Aleph Producciones S.A. is a film production company in Buenos Aires , Argentina . [1]   Filmography [ edit ]  Adolescente, sucre d'amour (1985)  Amico arabo, L' (1991)  Un Muro de silencio (1993)  Of Love and Shadows (1994)  Amigomío (1994)  Patrón (1995)  Kanya Ya Ma Kan, Beyrouth (1995)  Evita (1996)  Un Asunto privado (1996)  Dile a Laura que la quiero (1997)  Sus ojos se cerraron y el mundo sigue andando (1997)  Frontera Sur (1998)  El Evangelio de las Maravillas (1998)  Operación Fangio (1999)  El Amateur (1999)  Nueces para el amor (2000)  El Despertar de L (2001)  Sudeste (2001)  El Séptimo arcángel (2003)  Dolores de casada (2004)  18-J (2004)  ...al fin, el mar (2005)  La Manos (2006)  Suspiros del corazón (2006)  Footnotes [ edit ]   ^  Aleph Producciones S.A. at the Internet Movie Database .    External links [ edit ]"

Обучим Doc2Vec модель на этих текстах.

In [9]:
from gensim.models import doc2vec
from collections import namedtuple
import re

В качестве минимальной предобработки удалим всё, что стоит в квадратных скобках (вместе со скобками), и заголовок «External links».

In [10]:
# Minimal clean-up: drop every bracketed span (such as "[ edit ]" or "[1]"),
# remove the literal heading "External links", then trim outer whitespace.
bracketed = re.compile(r'\[.*?\]')
texts = [bracketed.sub(r'', page).replace('External links', '').strip() for page in texts]

In [11]:
# Spot-check one page after the clean-up step.
texts[2]

'BD Cine (Burman Dubcovsky Cine) is a film production company in Buenos Aires , Argentina .   The firm was formed in 1995 by producer/director Daniel Burman and producer Diego Dubcovsky . According to film critic Joel Poblete, who writes for Mabuse, a cinema magazine, Daniel Burman and Diego Dubcovsky are two of the members of the New Argentina Cinema which began c. 1998.    Filmography   Plaza de almas (1997)  Un Crisantemo Estalla en Cinco Esquinas (1998)  Garage Olimpo (1999)  Esperando al Mesías (2000)  Le Loup de la côte Ouest (2002)  Todas Las Azafatas Van Al Cielo (2002)  Nadar solo (2003)  Lesbianas de Buenos Aires (2004)  18-J (2004)  El Abrazo Partido (2004)  The Motocycle Diaries (2004)  Como un avión estrellado (2005)  Un Año sin amor (2005)  Chicha tu madre (2006)  Derecho de Familia (2006)  Footnotes    ^  BD Cine at the Internet Movie Database .   ^  Poblete, Joel  Archived 2007-08-27 at the Wayback Machine . Mabuse Film Magazine, "El cine argentino está muy vital," July

In [12]:
# Record type for one training document: a token list plus a list of tags.
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

# One document per page: lower-cased, whitespace-separated tokens, tagged
# with the page's position in `texts`.
docs = [
    analyzedDocument(page.lower().split(), [idx])
    for idx, page in enumerate(texts)
]

Обучим простую модель размерностью 50, поскольку объём текстов небольшой.

In [13]:
# Training settings: 50-dimensional vectors, a 300-token context window,
# min_count=1 so that no token is dropped for being rare, and 4 workers.
# NOTE(review): newer gensim releases appear to call the `size` keyword
# `vector_size`; confirm against the installed version before upgrading.
d2v_params = dict(size=50, window=300, min_count=1, workers=4)
model = doc2vec.Doc2Vec(docs, **d2v_params)



In [14]:
# Look up the learned 50-dimensional vector for the token 'disney'.
# NOTE(review): indexing the model object directly with a word looks like
# older-gensim behaviour; newer releases presumably need `model.wv[...]` - confirm.
model['disney']

array([  3.2451422 ,  -7.5870852 ,  11.749499  ,   7.889385  ,
         9.463734  ,  10.941975  ,   7.8471193 ,   4.839687  ,
        -8.494088  , -15.346264  , -19.743952  , -11.947046  ,
        12.570731  ,  -4.4351325 , -10.05778   ,   5.928354  ,
        -3.112053  ,   5.160321  ,  14.241574  ,  -0.10348638,
        -2.2602983 ,   0.7450586 ,  -1.8899426 ,   0.11646719,
        -4.1230764 , -13.235517  ,  11.18217   , -10.702431  ,
         7.686716  , -22.98512   ,   7.9461217 ,   0.56797326,
         7.1035275 ,  10.462612  ,   7.7894964 ,  -7.7927804 ,
         5.384924  ,  11.453537  ,  19.004173  ,  -8.7627945 ,
        20.537712  ,  -4.45977   ,  18.609728  ,   8.43817   ,
        10.998613  , -12.531673  , -13.76489   ,  -5.7255063 ,
        -3.4124498 ,  18.898981  ], dtype=float32)

In [15]:
import os, requests, zipfile, io

In [18]:
# Create the output directory for saved models. exist_ok=True makes this a
# single race-free call that is safe to re-run when the directory is present.
os.makedirs('models', exist_ok=True)

In [19]:
# Persist the trained Doc2Vec model under the local `models` directory.
model.save('models/prod_companies.d2v')

In [20]:
# Download the pre-trained GloVe archive once and unpack it into `models`.
# NOTE(review): the archive is fetched over plain HTTP; consider an HTTPS
# mirror or a checksum if integrity matters.
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
glove_zip = os.path.join('models', 'glove.6B.zip')

os.makedirs('models', exist_ok=True)

if not os.path.exists(glove_zip):
    # Stream to disk in chunks instead of buffering the whole archive in memory.
    partial = glove_zip + '.part'
    with requests.get(glove_url, stream=True, timeout=60) as r:
        r.raise_for_status()  # do not try to unzip an HTML error page
        with open(partial, 'wb') as fh:
            for chunk in r.iter_content(chunk_size=1 << 20):
                fh.write(chunk)
    # Rename only after a complete download, so an interrupted run never
    # leaves a truncated file that later runs would mistake for the archive.
    os.replace(partial, glove_zip)

with zipfile.ZipFile(glove_zip) as z:
    # Extract only the members that are not on disk yet, so re-running is cheap.
    missing = [m for m in z.namelist() if not os.path.exists(os.path.join('models', m))]
    z.extractall('models', members=missing)