In [10]:
import io 
import multiprocessing 
import numpy 

from gensim.test.utils import common_texts, get_tmpfile 
from gensim.models import Word2Vec 
from pdfminer.converter import TextConverter 
from pdfminer.pdfinterp import PDFPageInterpreter 
from pdfminer.pdfinterp import PDFResourceManager 
from pdfminer.pdfpage import PDFPage 
from nltk.corpus import stopwords 
from pymystem3 import Mystem 
from string import punctuation 

# Shared lemmatizer instance, created once and reused by preprocess_text().
mystem = Mystem()
# Russian stop-word list from the NLTK corpus; tokens found here are dropped
# by preprocess_text(). NOTE(review): presumably requires the NLTK
# "stopwords" corpus to be downloaded beforehand — confirm in the environment.
russian_stopwords = stopwords.words("russian")

def preprocess_text(text):
    """Lemmatize *text* and drop stop words, whitespace and punctuation.

    The text is lower-cased and lemmatized with the module-level ``mystem``
    instance. Tokens that are stop words (``russian_stopwords``) or that
    consist solely of whitespace and/or ``string.punctuation`` characters
    are discarded; the remaining lemmas are joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        A space-separated string of the surviving lemmas.
    """
    # Sets give O(1) membership tests instead of scanning the list/string
    # once per token.
    stop_words = frozenset(russian_stopwords)
    punctuation_chars = frozenset(punctuation)

    def is_noise(token):
        """Return True for tokens made only of whitespace/punctuation."""
        stripped = token.strip()
        # A per-character test is required: `stripped in punctuation` is a
        # substring check on a str, so runs such as "..." or "?!" would not
        # be recognised as punctuation. all() is True for an empty string,
        # which also filters whitespace-only tokens.
        return all(char in punctuation_chars for char in stripped)

    lemmas = mystem.lemmatize(text.lower())
    kept = [
        lemma for lemma in lemmas
        if lemma not in stop_words and not is_noise(lemma)
    ]
    return " ".join(kept)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Each page returned by ``PDFPage.get_pages`` is rendered through a
    ``TextConverter`` into an in-memory buffer.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or ``None`` when no text was produced.
    """
    resource_manager = PDFResourceManager()

    # The buffer and the converter are released even if opening or parsing
    # the PDF raises.
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                pages = PDFPage.get_pages(
                    fh, caching=True, check_extractable=True
                )
                for page in pages:
                    page_interpreter.process_page(page)
            # Read the buffer before closing the converter, as the original
            # code did.
            text = text_buffer.getvalue()
        finally:
            converter.close()

    # Empty output is reported as None (previously an implicit fall-through).
    return text if text else None

# Extract the PDF text, normalise it, and split it into tokens.
text = preprocess_text(extract_text_from_pdf('Sample1.pdf'))
arrWords = text.split()

# Leave one core free for the rest of the system, but never request fewer
# than one worker: on a single-core machine `cores - 1` would be 0.
cores = multiprocessing.cpu_count()

# The whole document is passed as a single training sentence.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4.0) — confirm against the installed gensim version.
w2v_model = Word2Vec([arrWords], min_count=1, size=300, workers=max(1, cores - 1))

# Document embedding: mean of the vectors of all tokens (repeats included).
w2v_model.wv[arrWords].mean(axis=0)

array([ 8.23466980e-04,  3.19822211e-05,  8.18215776e-05, -2.25525233e-04,
       -3.88862652e-04,  2.16814311e-04,  1.81495070e-05,  2.04173310e-04,
       -1.08528133e-04,  2.28139805e-04, -2.23607509e-04,  2.40985719e-05,
       -4.75857029e-04,  7.61005140e-05, -8.70453150e-05, -4.39162723e-05,
       -2.65287119e-04, -3.85116844e-04, -1.67430153e-05,  2.75289960e-04,
       -1.25050487e-04,  1.49703861e-04, -9.76057781e-05,  2.24810734e-04,
        3.18581617e-04, -8.85047411e-05,  3.90435540e-04, -2.49755016e-04,
       -1.84029544e-04, -1.61901189e-04, -5.25080723e-05, -1.49526677e-04,
       -1.41590965e-04,  1.30797431e-04, -2.12659634e-05,  3.81757709e-04,
        1.47264407e-04,  3.48594825e-04,  1.05225517e-04, -4.47019731e-04,
       -2.12682993e-04, -1.11588539e-04,  1.67629958e-04,  8.06534536e-06,
        1.49040889e-05,  1.93269967e-04, -3.12783639e-04,  1.68685783e-05,
        3.88688000e-04, -8.80697189e-05,  2.11557272e-04, -1.88897291e-04,
       -1.03224826e-04,  