In [1]:
import pandas as pd
import tensorflow as tf
import gdown
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Preparing the pandas dataframe

id = "1a6TppDeLhFhKso9vkYzRdPfualSRmRFf"
output = "datadummy50k_new_grouped"
gdown.download(id=id, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1a6TppDeLhFhKso9vkYzRdPfualSRmRFf
To: /content/datadummy50k_new_grouped
100%|██████████| 6.90M/6.90M [00:00<00:00, 213MB/s]


'datadummy50k_new_grouped'

In [3]:
df = pd.read_csv(f"/content/{output}")

# Cleaning the unused columns
df = df.drop(df.columns[[0]], axis=1)

# Transform 'Workers' from strings into lists
df['Workers'] = df['Workers'].str.replace("[\'\[\]]","",regex=True)
df['Workers'] = df['Workers'].str.replace(", ","|",regex=True)
df['Workers'] = df['Workers'].apply(lambda s: [l for l in str(s).split('|')])
df


Unnamed: 0,Project Type,Topics,Sub Topic,Difficulty,Workers
0,ML,Time-series,LSTM,6,"[Gabriel Kheisa, Nyoman Satiya Najwa Sadha, Ch..."
1,Front End,Front End,Angular,17,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
2,Front End,Front End,Angular,18,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
3,ML,Computer Vision,Object Detection,16,"[Andhika Zulfikri, I Putu Ranantha Nugraha Sup..."
4,Back End,Back End,Django,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."
...,...,...,...,...,...
49995,Back End,Back End,Express.js,10,"[Abdullah Nur Hudi, Vania Kylie, Wahyu Fauzan,..."
49996,Back End,Back End,Express.js,20,"[Abdullah Nur Hudi, Vania Kylie, Wahyu Fauzan,..."
49997,ML,Time-series,LSTM,19,"[Gabriel Kheisa, Rikip Ginanjar, Nyoman Satiya..."
49998,Back End,Back End,Django,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."


In [4]:
# Create dictionaries
# Assuming you have a DataFrame called 'df' and you want to rename the column 'old_column' to 'new_column'
df.rename(columns={'Project Type': 'Project_Type'}, inplace=True)
df.rename(columns={'Sub Topic': 'Sub_Topic'}, inplace=True)


top_dict = dict(enumerate(df["Topics"].astype('category').cat.categories))
subtop_dict = dict(enumerate(df["Sub_Topic"].astype('category').cat.categories))
ptype_dict = dict(enumerate(df["Project_Type"].astype('category').cat.categories))

print(top_dict, subtop_dict, ptype_dict)

{0: 'Back End', 1: 'Classification & Regression', 2: 'Computer Vision', 3: 'Data Engineering', 4: 'Front End', 5: 'NLP', 6: 'Speech / Audio', 7: 'Time-series'} {0: 'ARIMA', 1: 'Angular', 2: 'Data Warehousing', 3: 'Django', 4: 'Ember.js', 5: 'Express.js', 6: 'LSTM', 7: 'Linear Regression', 8: 'Logistic Regression', 9: 'Music Information Retrieval', 10: 'Node.js', 11: 'Object Detection', 12: 'React', 13: 'Sentiment Analysis', 14: 'Speech Recognition', 15: 'Topic Modeling'} {0: 'Back End', 1: 'Front End', 2: 'ML'}


In [5]:
string_col = ['Topics', 'Sub_Topic', 'Project_Type']

# Transform columns from string into integer

for col in string_col:
  df[col] = df[col].astype('category').cat.codes

df.head()

Unnamed: 0,Project_Type,Topics,Sub_Topic,Difficulty,Workers
0,2,7,6,6,"[Gabriel Kheisa, Nyoman Satiya Najwa Sadha, Ch..."
1,1,4,1,17,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
2,1,4,1,18,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
3,2,2,11,16,"[Andhika Zulfikri, I Putu Ranantha Nugraha Sup..."
4,0,0,3,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."


In [6]:
# Transform other columns into strings

for col in string_col:
  df[col] = df[col].astype('category')

print(df.dtypes)
df.head()

Project_Type    category
Topics          category
Sub_Topic       category
Difficulty         int64
Workers           object
dtype: object


Unnamed: 0,Project_Type,Topics,Sub_Topic,Difficulty,Workers
0,2,7,6,6,"[Gabriel Kheisa, Nyoman Satiya Najwa Sadha, Ch..."
1,1,4,1,17,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
2,1,4,1,18,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
3,2,2,11,16,"[Andhika Zulfikri, I Putu Ranantha Nugraha Sup..."
4,0,0,3,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."


In [7]:
# Creating list of labels
labels_list = df['Workers']
labels_list = list(labels_list)
mlb = MultiLabelBinarizer()
mlb.fit(labels_list)

N_LABELS = len(mlb.classes_)
for (i, label) in enumerate(mlb.classes_):
    print("{}. {}".format(i, label))

0. Abdullah Nur Hudi
1. Abiyyu Diora Haqi
2. Alvin Tan
3. Andhika Zulfikri
4. Andi Rezal Oktavianto
5. Azis Sofyanto
6. Bagja Kurniadi
7. Chairul Rizqi
8. Christopher Kristianto
9. Farel Eden
10. Gabriel Kheisa
11. I Putu Ranantha Nugraha Suparta
12. Iga Narendra Pramawijaya
13. Imam
14. Muhammad Raden Syawali Akbar
15. Nyoman Satiya Najwa Sadha
16. Putu Gede Agung Karna Sampalan
17. Rikip Ginanjar
18. Sandrian Yulianto
19. Sarah Sema Khairunisa
20. Suci Rahmadani
21. Vania Kylie
22. Wahyu Fauzan


In [8]:
testpredict = df.copy()
testpredict

Unnamed: 0,Project_Type,Topics,Sub_Topic,Difficulty,Workers
0,2,7,6,6,"[Gabriel Kheisa, Nyoman Satiya Najwa Sadha, Ch..."
1,1,4,1,17,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
2,1,4,1,18,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
3,2,2,11,16,"[Andhika Zulfikri, I Putu Ranantha Nugraha Sup..."
4,0,0,3,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."
...,...,...,...,...,...
49995,0,0,5,10,"[Abdullah Nur Hudi, Vania Kylie, Wahyu Fauzan,..."
49996,0,0,5,20,"[Abdullah Nur Hudi, Vania Kylie, Wahyu Fauzan,..."
49997,2,7,6,19,"[Gabriel Kheisa, Rikip Ginanjar, Nyoman Satiya..."
49998,0,0,3,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."


In [9]:
testpredict = testpredict[40:47]
testpredict

Unnamed: 0,Project_Type,Topics,Sub_Topic,Difficulty,Workers
40,0,0,10,11,"[Sandrian Yulianto, Wahyu Fauzan, Abdullah Nur..."
41,2,3,2,12,"[Farel Eden, Andhika Zulfikri, Andhika Zulfikr..."
42,2,5,13,17,"[Bagja Kurniadi, Alvin Tan, Sarah Sema Khairun..."
43,0,0,5,19,"[Wahyu Fauzan, Abdullah Nur Hudi, Vania Kylie,..."
44,2,5,13,21,"[Bagja Kurniadi, Alvin Tan, Sarah Sema Khairun..."
45,2,3,2,14,"[Farel Eden, Gabriel Kheisa, Christopher Krist..."
46,1,4,12,9,"[Abiyyu Diora Haqi, Iga Narendra Pramawijaya]"


In [10]:
#splitting the data here 

train, test = train_test_split(df, test_size=0.05)
train, val = train_test_split(train, test_size=0.05)

In [11]:
# checking the size

train_size = len(train)
val_size = len(val)
test_size = len(test)

print("Number of examples in the train set:", train_size)
print("Number of examples in the validation set:", val_size)
print("Number of examples in the test set:", test_size)


Number of examples in the train set: 45125
Number of examples in the validation set: 2375
Number of examples in the test set: 2500


In [12]:
# Making the labels

train_labels = train.pop('Workers')
val_labels = val.pop('Workers')
test_labels = test.pop('Workers')
testpredict_split_labels = testpredict.pop('Workers')

train_labels = list(train_labels)
val_labels = list(val_labels)
test_labels = list(test_labels)
testpredict_labels = list(testpredict_split_labels)


train_labels2 = mlb.transform(train_labels)
val_labels2 = mlb.transform(val_labels)
test_labels2 = mlb.transform(test_labels)
testpredict_labels2 = mlb.transform(testpredict_labels)

In [13]:
#train_labels2

In [14]:
#labels_list

In [15]:
# Custom F1Score metric
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        precision = self.precision.result()
        recall = self.recall.result()
        f1_score = 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))
        return f1_score

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()


In [16]:
# get the model
def get_model(n_inputs, n_outputs):
	model = tf.keras.Sequential()
	model.add(tf.keras.layers.Dense(1000, input_dim=n_inputs,use_bias=True, kernel_initializer='he_uniform', activation='relu'))
	model.add(tf.keras.layers.Dense(23, use_bias=True, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics='accuracy')
	return model

In [17]:
n_inputs, n_outputs = train.shape[1], train_labels2.shape[1]
# get model
model = get_model(n_inputs, n_outputs)


In [18]:
# using val set
model.fit(x=train, y=train_labels2, validation_data=(val, val_labels2), epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa97e5e6d40>

**Added by Seno**

In [19]:
# Evaluating the model using the test set
loss, accuracy = model.evaluate(x=testpredict, y=testpredict_labels2)
print("Accuracy", accuracy)

Accuracy 0.2857142984867096


In [20]:
#Get key of a dictionary
def get_key(d, val):
    return [k for k, v in d.items() if v == val]

['ML', 'Front End', 'Back End']
['Time-series', 'Speech / Audio', 'NLP', 'Data Engineering', 'Computer Vision', 'Classification & Regression']
{'Time-series': ['LSTM', 'ARIMA'], 'Speech / Audio': ['Speech Recognition', 'Music Information Retrieval'], 'NLP': ['Topic Modeling', 'Sentiment Analysis'], 'Data Engineering': ['Data Warehousing'], 'Computer Vision': ['Object Detection'], 'Classification & Regression': ['Logistic Regression', 'Linear Regression'], 'Front End': ['React', 'Ember.js', 'Angular'], 'Back End': ['Node.js', 'Express.js', 'Django']}


In [21]:
# Testing predictions
testype = 'ML' #Input here
testopic = 'NLP' #Input here
testopicsub = 'Django' #Input here
testdif = 10 #Input here

testX = [get_key(ptype_dict, testype)[0], get_key(top_dict, testopic)[0],
         get_key(subtop_dict, testopicsub)[0], testdif]
testX = np.asarray([testX])
yhat = model.predict(testX)[0]

# Converting the prediction into dataframe
predf = pd.DataFrame(yhat, index=mlb.classes_)
predf = predf.multiply(100).round(0).sort_values(by=0, ascending=False)
predf = predf[predf[0] >= 1]
predf



Unnamed: 0,0
Gabriel Kheisa,90.0
Rikip Ginanjar,88.0
Nyoman Satiya Najwa Sadha,75.0
Farel Eden,49.0
Bagja Kurniadi,38.0
Alvin Tan,31.0
Sarah Sema Khairunisa,22.0
Andhika Zulfikri,8.0
Azis Sofyanto,5.0
Suci Rahmadani,5.0


In [22]:
activetalents = ['Nyoman Satiya Najwa Sadha', 'Rikip Ginanjar', 'I Putu Ranantha Nugraha Suparta', 'Putu Gede Agung Karna Sampalan',
                 'Sarah Sema Khairunisa', 'Christopher Kristianto', 'Azis Sofyanto']

besteam = predf.filter(items=activetalents, axis=0)
besteam = besteam.sort_values(by=0, ascending=False).head(5)
besteam = besteam.set_axis(['Prediction'], axis='columns')

print(besteam)

                                 Prediction
Rikip Ginanjar                         88.0
Nyoman Satiya Najwa Sadha              75.0
Sarah Sema Khairunisa                  22.0
Azis Sofyanto                           5.0
I Putu Ranantha Nugraha Suparta         1.0


*Coret-coretan Daffa*

In [23]:
import tensorflow as tf

model.save('model')

# Load the model
model = tf.keras.models.load_model('/content/model', custom_objects= {'f1_score': F1Score()})

# Save the model in TensorFlow SavedModel format




In [24]:
model = tf.keras.models.load_model('/content/model')

In [25]:
model = tf.keras.models.load_model('/content/model')

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics='accuracy'
)

In [26]:
model.evaluate(x=testpredict, y=testpredict_labels2)



[0.1590304672718048, 0.2857142984867096]

In [27]:
model.save('model.h5')

In [28]:
strez = df.loc[:10,:]
strez = strez.drop(labels=[1,9,6])
strez

# Creating list of labels
labels_list_2 = strez['Workers']
labels_list_2 = list(labels_list_2)
mlb = MultiLabelBinarizer()
mlb.fit(labels_list_2)

N_LABELS_2 = len(mlb.classes_)
for (i, label) in enumerate(mlb.classes_):
    print("{}. {}".format(i, label))

# Making the labels

labelstrez = strez.pop('Workers')
labelstrez_l = list(labelstrez)
labelstrezz = mlb.transform(labelstrez_l)

print(strez, labelstrezz)

0. Abdullah Nur Hudi
1. Abiyyu Diora Haqi
2. Alvin Tan
3. Andhika Zulfikri
4. Andi Rezal Oktavianto
5. Azis Sofyanto
6. Bagja Kurniadi
7. Chairul Rizqi
8. Christopher Kristianto
9. Farel Eden
10. Gabriel Kheisa
11. I Putu Ranantha Nugraha Suparta
12. Iga Narendra Pramawijaya
13. Imam
14. Muhammad Raden Syawali Akbar
15. Nyoman Satiya Najwa Sadha
16. Putu Gede Agung Karna Sampalan
17. Rikip Ginanjar
18. Sandrian Yulianto
19. Sarah Sema Khairunisa
20. Suci Rahmadani
21. Vania Kylie
22. Wahyu Fauzan
   Project_Type Topics Sub_Topic  Difficulty
0             2      7         6           6
2             1      4         1          18
3             2      2        11          16
4             0      0         3          15
5             2      1         7          13
7             2      3         2          18
8             2      5        13          19
10            1      4         1          21 [[0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 

In [29]:
import tensorflow as tf

# Load pre-trained base model
base_model = tf.keras.models.load_model('/content/model.h5')

# Freeze base model layers
for layer in base_model.layers:
    layer.trainable = False

# Create new model
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(N_LABELS_2, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[F1Score()])


In [30]:
print(labelstrezz)

[[0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1]
 [0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0]]


In [31]:
model.fit(x=strez, y=labelstrezz, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


  m.reset_state()


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa975eb9f00>

In [32]:
newX = np.asarray(strez)
yhat = model.predict(newX)
print(yhat[0])

#newX = np.asarray(df[:10])
#yhat = model.predict(newX)

#print(yhat)

[0.21779197 0.28761163 0.32819885 0.26344538 0.2869926  0.30234674
 0.25467244 0.22431633 0.4192899  0.24999294 0.40900952 0.28971618
 0.2789164  0.28195295 0.28228068 0.28995228 0.22556609 0.26270542
 0.2502333  0.33679724 0.27731735 0.2399235  0.23821403]


In [33]:
strezjing = df.loc[:10,:]
strezjing = strezjing.drop(labels=[1,9,6])
strezjing

Unnamed: 0,Project_Type,Topics,Sub_Topic,Difficulty,Workers
0,2,7,6,6,"[Gabriel Kheisa, Nyoman Satiya Najwa Sadha, Ch..."
2,1,4,1,18,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."
3,2,2,11,16,"[Andhika Zulfikri, I Putu Ranantha Nugraha Sup..."
4,0,0,3,15,"[Chairul Rizqi, Sandrian Yulianto, Abdullah Nu..."
5,2,1,7,13,"[Azis Sofyanto, Suci Rahmadani, I Putu Rananth..."
7,2,3,2,18,"[Farel Eden, Christopher Kristianto, Christoph..."
8,2,5,13,19,"[Alvin Tan, Bagja Kurniadi, Sarah Sema Khairun..."
10,1,4,1,21,"[Andi Rezal Oktavianto, Imam, Abiyyu Diora Haq..."


In [34]:
# Converting the prediction into dataframe
predf = pd.DataFrame(yhat[0], index=mlb.classes_)
predf = predf.multiply(100).round(0).sort_values(by=0, ascending=False)
predf = predf[predf[0] >= 1].head(3)
predf

Unnamed: 0,0
Christopher Kristianto,42.0
Gabriel Kheisa,41.0
Sarah Sema Khairunisa,34.0


In [35]:
# using test set
#model.fit(x=train, y=train_labels2, validation_data=(test, test_labels2), epochs=20, verbose=1)

In [36]:
#model.fit(df, labels_bin, verbose=1, epochs=20)

In [37]:
# fit the model on all data
#model.fit(df, labels_bin, verbose=1, epochs=100)

In [38]:
# ga ngerti caranya bre evaluatenya
#model.evaluate(train, validation=test)

In [39]:
#model2 = get_model(n_inputs, n_outputs)
#model2.fit()

In [40]:
#model.summary()

In [41]:
#print(np.asarray(df[:10]))

In [42]:
# make a prediction for new data
#newX = np.asarray(df[:10])
#yhat = model.predict(newX)

#print(yhat)

In [43]:
#predf = pd.DataFrame(yhat[0], index=mlb.classes_)
#predf.head()

In [44]:
#predf = predf.multiply(100).round(0).sort_values(by=0, ascending=False)

In [45]:
#print(predf)

In [46]:
#hasil = predf[0:5]

#hasil