In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# 1. Load the Data

In [3]:
df = pd.read_csv("netflix_titles.csv")

In [4]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
df = df[["title","description"]]
df.head()

Unnamed: 0,title,description
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,Ganglands,To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,In a city of coaching centers known to train I...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        8807 non-null   object
 1   description  8807 non-null   object
dtypes: object(2)
memory usage: 137.7+ KB


# 2. Data Pre-processing and Clustering using K-Means

In [7]:
# Extract the description column as a NumPy array of unicode strings; this is
# the text that is passed to the TF-IDF vectorizer.
desc = df['description'].to_numpy().astype("U")

In [8]:
vectorizer = TfidfVectorizer(stop_words="english")
features = vectorizer.fit_transform(desc)

In [9]:
k=10
# Fix the random seed: k-means++ initialisation is stochastic and n_init=1 keeps
# only a single run, so without random_state every re-run produces different
# clusters and different cluster ids for the labels derived from this model.
model=KMeans(n_clusters=k, init='k-means++',max_iter=100,n_init=1,random_state=42)
model.fit(features)

KMeans(max_iter=100, n_clusters=10, n_init=1)

In [10]:
df['cluster']=model.labels_

In [11]:
df.head()

Unnamed: 0,title,description,cluster
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",2
1,Blood & Water,"After crossing paths at a party, a Cape Town t...",1
2,Ganglands,To protect his family from a powerful drug lor...,4
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",6
4,Kota Factory,In a city of coaching centers known to train I...,8


In [12]:
# Inspect the centroid of each cluster and print its highest-weighted feature terms

In [13]:
# For every cluster, sort the vocabulary indices by descending centroid weight
# so that the highest-weighted terms come first.
order_centroids = model.cluster_centers_.argsort()[:,::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
  terms = vectorizer.get_feature_names_out()
else:
  terms = vectorizer.get_feature_names()

for i in range(k):
  print("Cluster %d:" %i)
  for j in order_centroids[i, :5]: #printing 5 feature terms of each cluster
       print(' %s ' %terms[j])
  print('--------------')
    


Cluster 0:
 new 
 family 
 world 
 friends 
 home 
--------------
Cluster 1:
 town 
 small 
 new 
 girl 
 family 
--------------
Cluster 2:
 father 
 family 
 young 
 son 
 man 
--------------
Cluster 3:
 man 
 young 
 woman 
 life 
 family 
--------------
Cluster 4:
 group 
 war 
 ii 
 world 
 civil 
--------------
Cluster 5:
 young 
 woman 
 couple 
 love 
 boy 
--------------
Cluster 6:
 documentary 
 series 
 follows 
 explores 
 life 
--------------
Cluster 7:
 school 
 high 
 students 
 student 
 new 
--------------
Cluster 8:
 life 
 love 
 new 
 woman 
 family 
--------------
Cluster 9:
 killer 
 discovers 
 stop 
 serial 
 attack 
--------------




# 3. Classification using BERT

In [14]:
#installing the transformers library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 9.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstall

In [15]:
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [16]:
df['cluster'].value_counts()

0    4832
8     815
6     639
5     490
3     459
7     392
4     390
2     296
9     263
1     231
Name: cluster, dtype: int64

In [17]:
#initializing a tokenizer of the BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
#example of how the tokenizer works
df['description'].iloc[0]

'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [19]:
# Worked example: encode the first description with the BERT tokenizer.
# The result is padded/truncated to 256 positions (see the (1, 256) tensors
# displayed in the next cell).
token=tokenizer.encode_plus(
    df['description'].iloc[0],
    max_length  = 256,    #512 is the default length
    truncation = True,    #cut inputs that exceed max_length
    padding = 'max_length',    #pad shorter inputs up to max_length
    add_special_tokens = True,  #special tokens-CLS,Padding,Separation
    return_tensors = 'tf'    #return TensorFlow tensors
)

In [20]:
token

{'input_ids': <tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  1249,  1123,  1401,  1485,  1116,  1103,  1322,  1104,
         1117,  1297,   117, 13140, 14477, 20628,  2921,  5251,  1117,
         1473,  1107,  1107, 14850,  2109,  1105,  4824,  1348,  3242,
         1106,  1494,  1172,  1241,  1339,  1103, 14014,   119,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [21]:
# Pre-allocate one row per title and one column per token position (256) for
# the BERT input ids and attention masks. np.zeros defaults to float64, which
# is the dtype the rest of the pipeline is built around.
n_titles = len(df)
X_input_ids = np.zeros((n_titles, 256))
X_attn_masks = np.zeros((n_titles, 256))

In [22]:
X_input_ids.shape

(8807, 256)

In [23]:
#Generating the training data
def generate_training_data(df,ids,masks,tokenizer):
  """Tokenize every description in ``df`` and store the encodings row by row.

  Args:
    df: frame with a 'description' column; row i is written to ids[i] / masks[i].
    ids: pre-allocated array with one row per description and 256 columns that
      receives the token ids.
    masks: pre-allocated array of the same shape that receives the attention masks.
    tokenizer: object exposing ``encode_plus`` (the BERT tokenizer in this notebook).

  Returns:
    The same ``ids`` and ``masks`` arrays, which have been filled in place.
  """
  # Wrap the column itself (not the enumerate generator) and pass its length so
  # tqdm can render a real progress bar with a total instead of a bare counter.
  for i,text in enumerate(tqdm(df['description'],total=len(df))):
    tokenized_text = tokenizer.encode_plus(
        text,
        max_length = 256,
        truncation=True,
        padding = 'max_length',
        add_special_tokens=True,
        return_tensors = 'tf'
    )

    # Each encoding has a leading batch axis of size 1; it is broadcast into row i.
    ids[i,:] = tokenized_text.input_ids
    masks[i,:]=tokenized_text.attention_mask
  return ids,masks

In [24]:
X_input_ids,X_attn_masks = generate_training_data(df,X_input_ids,X_attn_masks,tokenizer)

0it [00:00, ?it/s]

In [25]:
X_input_ids

array([[  101.,  1249.,  1123., ...,     0.,     0.,     0.],
       [  101.,  1258.,  4905., ...,     0.,     0.,     0.],
       [  101.,  1706.,  3244., ...,     0.,     0.,     0.],
       ...,
       [  101.,  8540.,  1106., ...,     0.,     0.,     0.],
       [  101.,  1987.,  8517., ...,     0.,     0.,     0.],
       [  101.,   138., 16720., ...,     0.,     0.,     0.]])

In [26]:
# One-hot label matrix: one row per title, one column per output class.
num_classes = 10
labels = np.zeros((len(df), num_classes))
labels.shape

(8807, 10)

In [27]:
# Put a single 1 in each row, in the column given by that row's cluster id.
row_idx = np.arange(len(df))
labels[row_idx, df['cluster'].values] = 1

In [28]:
labels

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [29]:
# Build a tf.data pipeline that yields one (input_ids, attention_mask, label)
# triple per title by slicing the three arrays along their first axis.
tensors = (X_input_ids, X_attn_masks, labels)
dataset = tf.data.Dataset.from_tensor_slices(tensors)

In [30]:
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [31]:
#mapping function
def Descriptionmapping(input_ids,attn_masks,labels):
  """Regroup a flat (ids, masks, labels) element into (features, labels).

  The features are returned as a dict keyed by 'input_ids' and
  'attention_mask'; the labels are passed through unchanged.
  """
  model_inputs = dict(input_ids=input_ids, attention_mask=attn_masks)
  return model_inputs, labels

In [32]:
dataset = dataset.map(Descriptionmapping)

In [33]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(10,), dtype=tf.float64, name=None))>

In [34]:
# Shuffle once, then batch. The shuffle order must stay fixed across iterations:
# the train/validation split is made afterwards with take()/skip(), and with the
# default reshuffle_each_iteration=True every new pass over the dataset would be
# reshuffled, so validation batches could contain examples already used for training.
dataset = dataset.shuffle(10000,reshuffle_each_iteration=False).batch(16,drop_remainder=True)


In [35]:
# Number of batches assigned to training: the fraction p of all full batches
# (batch size 16).
p = 0.8
total_batches = len(df) // 16
train_size = int(total_batches * p)

In [36]:
train_size

440

In [37]:
# The first train_size batches are used for training, the remaining ones for validation.
# NOTE(review): the two subsets are only disjoint if `dataset` yields its batches in
# the same order on every pass — confirm the upstream shuffle does not reshuffle.
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

In [38]:
from transformers import TFBertModel

In [39]:
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading tf_model.h5:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [40]:
# Two int32 inputs of length 256; their names match the keys of the feature dict
# produced by Descriptionmapping ('input_ids' and 'attention_mask').
input_ids = tf.keras.layers.Input(shape=(256,),name='input_ids',dtype='int32')
attention_masks = tf.keras.layers.Input(shape=(256,),name='attention_mask',dtype='int32')

# Element [1] of the BERT main-layer output is used as the sentence representation.
# NOTE(review): presumably this is the pooled [CLS] output (pooler_output) — confirm
# against the transformers documentation.
bert_embds = bert_model.bert(input_ids,attention_mask=attention_masks)[1]

# Classification head: 512-unit ReLU layer followed by a 10-way softmax.
intermediate_layer=tf.keras.layers.Dense(512,activation='relu',name='intermediate_layer')(bert_embds)

output_layer = tf.keras.layers.Dense(10,activation='softmax', name='output_layer')(intermediate_layer)


# NOTE: this rebinds `model`, which previously referred to the fitted KMeans
# estimator; cells that read model.labels_ / model.cluster_centers_ cannot be
# re-run after this point without refitting KMeans.
model = tf.keras.Model(inputs=[input_ids,attention_masks],outputs=output_layer)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [41]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5,decay=1e-6)
loss_func= tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [42]:
model.compile(optimizer = optim, loss = loss_func,metrics=[acc])

In [43]:
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



In [44]:
model.save('bert-model')



INFO:tensorflow:Assets written to: bert-model/assets


INFO:tensorflow:Assets written to: bert-model/assets


In [45]:
loaded_model = tf.keras.models.load_model('bert-model')

In [46]:
test_desc = 'When the CIAs top asset -- his identity known to no one -- uncovers agency secrets, he triggers a global hunt by assassins set loose by his ex-colleague.'

In [48]:
def prepare_data(test_desc,tokenizer):
    """Tokenize one description into the feature dict passed to the classifier.

    The text is encoded with the same settings used for the training data
    (padding/truncation to 256 positions, special tokens added) and both
    tensors are cast to float64.

    Args:
        test_desc: the description text to encode.
        tokenizer: object exposing ``encode_plus``.

    Returns:
        dict with keys 'input_ids' and 'attention_mask'.
    """
    encoded = tokenizer.encode_plus(
        test_desc,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    features = {}
    for key in ('input_ids', 'attention_mask'):
        features[key] = tf.cast(getattr(encoded, key), tf.float64)
    return features

In [50]:
tokenized_input_text = prepare_data(test_desc,tokenizer)

In [51]:
prob = loaded_model.predict(tokenized_input_text)

In [52]:
prob

array([[0.7415812 , 0.01298147, 0.00448805, 0.0139986 , 0.09678303,
        0.00245494, 0.00891277, 0.02669697, 0.01107309, 0.08102987]],
      dtype=float32)

In [53]:
# Take the most probable cluster for the first test description and show the
# five highest-weighted terms of that cluster's centroid.
i = output_index = np.argmax(prob[0])
top_term_ids = order_centroids[i, :5]
print("Cluster %d:" %i)
for j in top_term_ids:
    print(' %s ' %terms[j])
print('--------------')

Cluster 0:
 new 
 family 
 world 
 friends 
 home 
--------------


In [55]:
test_desc2 = 'When a young woman encounters a renowned coach in a boxing gym, she shares her boxing aspirations with him and convinces him to teach her. Despite her fathers disapproval, she follows her passion.'

In [56]:
tokenized_input_text2 = prepare_data(test_desc2,tokenizer)

In [57]:
prob2 = loaded_model.predict(tokenized_input_text2)

In [58]:
prob2

array([[0.00992754, 0.0593957 , 0.07410372, 0.15581188, 0.01332013,
        0.64533037, 0.00291203, 0.01719272, 0.006454  , 0.01555189]],
      dtype=float32)

In [59]:
# Take the most probable cluster for the second test description and show the
# five highest-weighted terms of that cluster's centroid.
i = output_index = np.argmax(prob2[0])
top_term_ids = order_centroids[i, :5]
print("Cluster %d:" %i)
for j in top_term_ids:
    print(' %s ' %terms[j])
print('--------------')

Cluster 5:
 young 
 woman 
 couple 
 love 
 boy 
--------------
