In [2]:
import pandas as pd 
import re 
import numpy as np 
import pysrt 
import seaborn as sns

## Extract the labels and the data and create a dataframe from it

In [3]:
document = pysrt.open(path='EasyMovie/text-eng.srt')

In [4]:
document[0].text

'This week we are going one step further into the product teams organization'

We need  to convert the text in list of texte or raw document format 
exp : ["text", "text", "text"]

In [5]:
text = [doc.text for doc in document]

In [6]:
print(text)



['This week we are going one step further into the product teams organization', 'with first, the video that will follow, and', 'and weekly meetings for each product teams.', 'So this week with the CREATE team we have been focusing on', 'the graphical charts', 'redesign and we will begin next week with a', 'technical redesign.', 'On the MANAGE team, we focused on the hierarchy between accounts', 'and this is the start of the work on organizing accounts', 'in EasyMovie. I really need you CSMs so', 'so answer my questions on Slack.', 'so this week for the PROMOTE team it was the beginning of', "this week's project about Dr Quinn video so stay", 'Stay tuned!', 'Hi guys my priority of the week was to try and create a new process for the product team', 'to establish the product roadmap', 'so that it’s a more collaborative process with the CSMs.', 'so CSMs, you’ll be hearing from me very soon.']


In [7]:
df = pd.DataFrame(data={'name':['W42'], 'text':[text], 'template':["Product Weekly News"]})

In [8]:

df.head()




Unnamed: 0,name,text,template
0,W42,[This week we are going one step further into ...,Product Weekly News



For this vieo I don't use the rtf format that contains all the metadata because I don't know if we are going
to use it in the future. 
* If it's the case we need to know if the data will be store in a directory by template ? 
* If it's "srt" format the data must be split in directory by template type. (I think)
* If it's "srt" format the name of the video should be the name of the file.


# Feature extraction

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

With the count vectorizer we extract the vocabulary from of the texe 

In [10]:
text_studied = df.iloc[0, 1]
count_vec = CountVectorizer()
count_vec._validate_vocabulary()
text_vectized = count_vec.fit_transform(text_studied)
text_vectized

<18x85 sparse matrix of type '<class 'numpy.int64'>'
	with 148 stored elements in Compressed Sparse Row format>

In [11]:
count_vec.get_feature_names()


['about',
 'accounts',
 'and',
 'answer',
 'are',
 'be',
 'been',
 'begin',
 'beginning',
 'between',
 'charts',
 'collaborative',
 'create',
 'csms',
 'dr',
 'each',
 'easymovie',
 'establish',
 'first',
 'focused',
 'focusing',
 'follow',
 'for',
 'from',
 'further',
 'going',
 'graphical',
 'guys',
 'have',
 'hearing',
 'hi',
 'hierarchy',
 'in',
 'into',
 'is',
 'it',
 'll',
 'manage',
 'me',
 'meetings',
 'more',
 'my',
 'need',
 'new',
 'next',
 'of',
 'on',
 'one',
 'organization',
 'organizing',
 'priority',
 'process',
 'product',
 'project',
 'promote',
 'questions',
 'quinn',
 'really',
 'redesign',
 'roadmap',
 'slack',
 'so',
 'soon',
 'start',
 'stay',
 'step',
 'team',
 'teams',
 'technical',
 'that',
 'the',
 'this',
 'to',
 'try',
 'tuned',
 'very',
 'video',
 'was',
 'we',
 'week',
 'weekly',
 'will',
 'with',
 'work',
 'you']

In [12]:
count_vec.vocabulary_

{'about': 0,
 'accounts': 1,
 'and': 2,
 'answer': 3,
 'are': 4,
 'be': 5,
 'been': 6,
 'begin': 7,
 'beginning': 8,
 'between': 9,
 'charts': 10,
 'collaborative': 11,
 'create': 12,
 'csms': 13,
 'dr': 14,
 'each': 15,
 'easymovie': 16,
 'establish': 17,
 'first': 18,
 'focused': 19,
 'focusing': 20,
 'follow': 21,
 'for': 22,
 'from': 23,
 'further': 24,
 'going': 25,
 'graphical': 26,
 'guys': 27,
 'have': 28,
 'hearing': 29,
 'hi': 30,
 'hierarchy': 31,
 'in': 32,
 'into': 33,
 'is': 34,
 'it': 35,
 'll': 36,
 'manage': 37,
 'me': 38,
 'meetings': 39,
 'more': 40,
 'my': 41,
 'need': 42,
 'new': 43,
 'next': 44,
 'of': 45,
 'on': 46,
 'one': 47,
 'organization': 48,
 'organizing': 49,
 'priority': 50,
 'process': 51,
 'product': 52,
 'project': 53,
 'promote': 54,
 'questions': 55,
 'quinn': 56,
 'really': 57,
 'redesign': 58,
 'roadmap': 59,
 'slack': 60,
 'so': 61,
 'soon': 62,
 'start': 63,
 'stay': 64,
 'step': 65,
 'team': 66,
 'teams': 67,
 'technical': 68,
 'that': 69,
 'th


# TF-IDF

we need to compute the frequency of word in the document and give more importance meaningful term  reduce 
the importance of words like 'the', 'of'. 

TF-IDF 

* compute TF = 1 / terme-frequency 
* IDF = log(total_number_of_document / frequncy_of_word_in_document)
* tf-idf = TF * IDF 

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(text)

In [14]:
columns = [corpus for corpus in text_studied]
features = tfidf_vectorizer.get_feature_names()

df = pd.DataFrame(tfidf_features.todense(), index=columns, columns=features)
df

Unnamed: 0,about,accounts,and,answer,are,be,been,begin,beginning,between,...,very,video,was,we,week,weekly,will,with,work,you
This week we are going one step further into the product teams organization,0.0,0.0,0.0,0.0,0.318786,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.228944,0.195954,0.0,0.0,0.0,0.0,0.0
"with first, the video that will follow, and",0.0,0.0,0.282865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.373946,0.0,0.0,0.0,0.0,0.373946,0.306823,0.0,0.0
and weekly meetings for each product teams.,0.0,0.0,0.286536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.432768,0.0,0.0,0.0,0.0
So this week with the CREATE team we have been focusing on,0.0,0.0,0.0,0.0,0.0,0.0,0.372658,0.0,0.0,0.0,...,0.0,0.0,0.0,0.267634,0.229068,0.0,0.0,0.267634,0.0,0.0
the graphical charts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
redesign and we will begin next week with a,0.0,0.0,0.28545,0.0,0.0,0.0,0.0,0.431129,0.0,0.0,...,0.0,0.0,0.0,0.309627,0.26501,0.0,0.377363,0.309627,0.0,0.0
technical redesign.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"On the MANAGE team, we focused on the hierarchy between accounts",0.0,0.295635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337756,...,0.0,0.0,0.0,0.242568,0.0,0.0,0.0,0.0,0.0,0.0
and this is the start of the work on organizing accounts,0.0,0.315786,0.238871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360779,0.0
in EasyMovie. I really need you CSMs so,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366162


In [15]:
## Latent derichelet allocation 



In [16]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=7, random_state=42)

LDA.fit(text_vectized)

 

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [17]:
topic = LDA.components_[0]

type(topic)

numpy.ndarray

In [18]:
# sorted from least to most important word 
sortedIndex = topic.argsort()[-10: ]
sortedIndex

array([ 1, 83, 63, 49, 34, 71, 45,  2, 46, 70])

In [19]:
for index in sortedIndex:
    print(count_vec.get_feature_names()[index])

accounts
work
start
organizing
is
this
of
and
on
the


In [20]:
import spacy 
import spacy.cli
spacy.cli.download("en_core_web_sm")
spacy_nlp = spacy.load('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [21]:
string = ""
for doc in document:
    string += doc.text
doc = spacy_nlp(string)
features_with_no_stop_word = [word.text for word in doc if not (word.is_stop or word.is_punct)]

In [22]:
features_with_no_stop_word


['week',
 'going',
 'step',
 'product',
 'teams',
 'organizationwith',
 'video',
 'follow',
 'andand',
 'weekly',
 'meetings',
 'product',
 'teams',
 'week',
 'CREATE',
 'team',
 'focusing',
 'onthe',
 'graphical',
 'chartsredesign',
 'begin',
 'week',
 'atechnical',
 'redesign',
 'MANAGE',
 'team',
 'focused',
 'hierarchy',
 'accountsand',
 'start',
 'work',
 'organizing',
 'accountsin',
 'EasyMovie',
 'need',
 'CSMs',
 'soso',
 'answer',
 'questions',
 'Slack.so',
 'week',
 'PROMOTE',
 'team',
 'beginning',
 'ofthis',
 'week',
 'project',
 'Dr',
 'Quinn',
 'video',
 'stayStay',
 'tuned!Hi',
 'guys',
 'priority',
 'week',
 'try',
 'create',
 'new',
 'process',
 'product',
 'teamto',
 'establish',
 'product',
 'roadmapso',
 'collaborative',
 'process',
 'CSMs.so',
 'CSMs',
 'hearing',
 'soon']

In [23]:
article_with_no_stop = ""
for word in features_with_no_stop_word:
    article_with_no_stop += " {}".format(word) 

article_with_no_stop = [article_with_no_stop]

In [24]:
article_vectorized = count_vec.fit_transform(article_with_no_stop)

In [25]:
article_vectorized

<1x56 sparse matrix of type '<class 'numpy.int64'>'
	with 56 stored elements in Compressed Sparse Row format>

In [29]:
LDA.fit(article_vectorized)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [37]:
sortedIndex = LDA.components_[4].argsort()[-10:]

In [38]:
for index in sortedIndex:
    print(count_vec.get_feature_names()[index])

guys
hearing
hi
hierarchy
manage
meetings
need
new
focusing
work


0.14285714285714285