<a href="https://colab.research.google.com/github/Shivam-316/Semi-Supervised-Classifier---NLP/blob/master/Semi_supervised_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Loading Files and Preprocessing Text



In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import tensorflow as tf
import re
import math
import io
import scipy

In [None]:
example_technical_skills = pd.read_csv('/content/Example_Technical_Skills.csv')

In [None]:
example_technical_skills.head()

Unnamed: 0,Technology Skills
0,SAP Fiori Developer
1,Oracle Instance Management & Strategy
2,Boomi Master Data Management
3,Digital Manufacturing on Cloud ( DMC)
4,DevOps


In [None]:
raw_skills = pd.read_csv('/content/Raw_Skills_Dataset.csv')

In [None]:
raw_skills.head()

Unnamed: 0,RAW DATA
0,What ifs
1,seniority
2,familiarity
3,functionalities
4,Lambdas


In [None]:
example_technical_skills['Technology Skills']

0                        SAP Fiori Developer
1      Oracle Instance Management & Strategy
2               Boomi Master Data Management
3      Digital Manufacturing on Cloud ( DMC)
4                                     DevOps
                       ...                  
974          Oracle Cloud Revenue Management
975         Oracle EBS Grid Contral Mgt Pack
976           Amazon Elastic MapReduce (EMR)
977                              Apache Kudu
978                               Oracle ESB
Name: Technology Skills, Length: 979, dtype: object

In [None]:
def preprocess(skill, pattern):
  skill = skill.lower()
  skill = re.sub(pattern, ' ', skill)
  skill = skill.split(" ")[0]
  return skill

In [None]:
askills = list(raw_skills['RAW DATA'][:])
tskills = list(example_technical_skills['Technology Skills'][:])
skills = []
labels = []
pattern = re.compile(r"[^a-zA-Z]")

In [None]:
for skill in askills:
  processed_skill = preprocess(skill, pattern)
  if len(processed_skill) < 3 or processed_skill in skills:
    continue
  skills.append(processed_skill)
  if skill in tskills:
    labels.append(1)
  else:
    labels.append(0)

In [None]:
dataset = pd.DataFrame({
    'skill' : skills,
    'isTechnical' : labels
})

In [None]:
known_tech_skills = list(dataset[dataset['isTechnical']== 1]['skill'])

In [None]:
known_tech_skills

['mysql',
 'scss',
 'eac',
 'openshift',
 'netsuite',
 'ethereum',
 'ftp',
 'onestream',
 'consul',
 'force',
 'phantom',
 'magento',
 'activemq',
 'postgis',
 'drm',
 'katalon']

In [None]:
known_tech_skills.extend(['python', 'java', 'react', 'javascript'])

In [None]:
dataset[dataset['isTechnical'] == 0].count()

skill          4036
isTechnical    4036
dtype: int64

In [None]:
vectorizer = tf.keras.layers.TextVectorization()
text_ds = tf.data.Dataset.from_tensor_slices(dataset['skill'])
vectorizer.adapt(text_ds)

In [None]:
model = tf.keras.models.Sequential([
  vectorizer,
  tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIMS, input_length=1, name="embedding"),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
path = '/content/logs'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=path)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

### REPEAT

In [None]:
tf_ds = tf.data.Dataset.from_tensor_slices((dataset['skill'], dataset['isTechnical']))

In [None]:
tf_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=4052>

In [None]:
BATCH = 64
BUFFER_SIZE = tf_ds.cardinality().numpy()
EMBEDDING_DIMS = 128

In [None]:
tf_ds = tf_ds.batch(BATCH).shuffle(BUFFER_SIZE).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
test_size = int(tf_ds.cardinality().numpy() * 15 / 100)
test_ds = tf_ds.take(test_size)
train_ds = tf_ds.skip(test_size)

In [None]:
VOCAB_SIZE = vectorizer.vocabulary_size()

In [None]:
VOCAB_SIZE

4054

In [None]:
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=5,
    callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd65fbf2510>

In [None]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorizer.get_vocabulary()

In [None]:
def cal_cosine_similarity(vector, embeddings):
  vector = tf.expand_dims(vector, 0)
  similarities = tf.losses.cosine_similarity(vector, embeddings)
  return similarities

In [None]:
dataset['isTechnical'] = 0

In [None]:
new_known_tech_skills = []
for idx, vector in enumerate(weights):
  sim = cal_cosine_similarity(vector, weights)
  top_3_sim = tf.argsort(sim)[:3]
  if vocab[idx] in known_tech_skills:
    for dissimilarity, skill in zip(tf.gather(sim, top_3_sim), tf.gather(vocab, top_3_sim)):
        print(dissimilarity, skill)
        dataset.loc[dataset['skill'] == skill, 'isTechnical'] = 1
        new_known_tech_skills.append(skill)
    print("\n")
known_tech_skills = new_known_tech_skills

tf.Tensor(-1.0, shape=(), dtype=float32) tf.Tensor(b'wireline', shape=(), dtype=string)
tf.Tensor(-0.62640214, shape=(), dtype=float32) tf.Tensor(b'consul', shape=(), dtype=string)
tf.Tensor(-0.5740413, shape=(), dtype=float32) tf.Tensor(b'force', shape=(), dtype=string)


tf.Tensor(-1.0, shape=(), dtype=float32) tf.Tensor(b'valid', shape=(), dtype=string)
tf.Tensor(-0.61955714, shape=(), dtype=float32) tf.Tensor(b'evolving', shape=(), dtype=string)
tf.Tensor(-0.6190408, shape=(), dtype=float32) tf.Tensor(b'eac', shape=(), dtype=string)


tf.Tensor(-0.99999994, shape=(), dtype=float32) tf.Tensor(b'tight', shape=(), dtype=string)
tf.Tensor(-0.732775, shape=(), dtype=float32) tf.Tensor(b'tam', shape=(), dtype=string)
tf.Tensor(-0.7295621, shape=(), dtype=float32) tf.Tensor(b'iic', shape=(), dtype=string)


tf.Tensor(-0.9999999, shape=(), dtype=float32) tf.Tensor(b'streaming', shape=(), dtype=string)
tf.Tensor(-0.48486957, shape=(), dtype=float32) tf.Tensor(b'compute', shape=(), dtype=str

In [None]:
dataset[dataset['isTechnical'] == 0][:50]

Unnamed: 0,skill,isTechnical
0,what,0
1,seniority,0
2,familiarity,0
3,functionalities,0
4,lambdas,0
6,object,0
7,relational,0
8,sql,0
9,orm,0
10,jpa,0


In [1]:
error = (3/51) * 100
print(f"Error %age after 3 Iterations: {error}")

Error %age after 3 Iterations: 5.88235294117647


### Download Embeddings

In [None]:
embedding_path = '/content/vectors.tsv'
metadata_path = '/content/metadata.tsv'

In [None]:
out_v = io.open(embedding_path, 'w', encoding='utf-8')
out_m = io.open(metadata_path, 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [None]:
try:
  from google.colab import files
  files.download(embedding_path)
  files.download(metadata_path)
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>