In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
import stanza
from sklearn.decomposition import PCA

# Part 1: Load data

In [2]:
df = pd.read_csv('mix_combined.csv')
df.head()

Unnamed: 0,user_id,languages,num of languages,summary,education,num of education,experiences,num of experiences,skill_k50_accounting / financial reporting / auditing,skill_k50_analysis / financial analysis / finance,...,language_taiwanese,language_tamil,language_telugu,language_thai,language_tigrinya,language_turkish,language_ukrainian,language_urdu,language_vietnamese,combined_text
0,1033137,"['english', 'spanish']",2.0,Director of biopharma equity research and publ...,NYU Stern School of Business\nMaster of Busine...,3.0,Evercore : Director - Biotechnology & Pharmace...,6.0,1,1,...,0,0,0,0,0,0,0,0,0,summary: Director of biopharma equity research...
1,1098586,[],0.0,blank,Fordham Gabelli School of Business\nMaster of ...,1.0,- : Research Analyst\nNone.-.\nNo description\...,3.0,0,1,...,0,0,0,0,0,0,0,0,0,summary: blank;education: Fordham Gabelli Scho...
2,1115736,[],0.0,I am the CEO of JLab. We are the fastest growi...,University of Oklahoma\nNA : NA\n1995-1999\n\n...,2.0,JLab Audio : President\n2011.11-.\nNo descript...,7.0,0,0,...,0,0,0,0,0,0,0,0,0,summary: I am the CEO of JLab. We are the fast...
3,1341457,[],0.0,I offer ~20 years’ experience on the evolution...,University of Wisconsin-Madison\nBBA : Finance...,3.0,SLR Capital Partners : Partner\n2023.1-.\nNo d...,6.0,0,1,...,0,0,0,0,0,0,0,0,0,summary: I offer ~20 years’ experience on the ...
4,1501168,[],0.0,We specialize in providing long-term financial...,University of Arkansas\nBachelor of Science (B...,1.0,"Boston Mountain Money Management, Inc. : Princ...",4.0,1,1,...,0,0,0,0,0,0,0,0,0,summary: We specialize in providing long-term ...


# Part 2: Preprocess combined_text

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stanza.download('en', processors='lemma')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-02 09:25:39 INFO: Downloaded file to C:\Users\Tina\stanza_resources\resources.json
2024-05-02 09:25:39 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package           |
---------------------------------
| lemma     | combined_nocharlm |

2024-05-02 09:25:39 INFO: File exists: C:\Users\Tina\stanza_resources\en\lemma\combined_nocharlm.pt
2024-05-02 09:25:39 INFO: Finished downloading models and saved to C:\Users\Tina\stanza_resources


In [4]:
# Didn't do stemming since it would cost overstemming problem.
#lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()    
    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+|@\S+|#\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Perform lemmatization
    doc = nlp(text)
    lemmatized_tokens = [word.lemma for sent in doc.sentences for word in sent.words]
    #lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join the preprocessed tokens back into a string
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

nlp = stanza.Pipeline('en', processors='tokenize,lemma')

2024-05-02 09:25:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-02 09:25:39 INFO: Downloaded file to C:\Users\Tina\stanza_resources\resources.json
2024-05-02 09:25:39 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-05-02 09:25:39 INFO: Using device: cpu
2024-05-02 09:25:39 INFO: Loading: tokenize
2024-05-02 09:25:41 INFO: Loading: mwt
2024-05-02 09:25:41 INFO: Loading: lemma
2024-05-02 09:25:41 INFO: Done loading processors!


In [5]:
df['preprocessed_combined_text']=df['combined_text'].apply(preprocess_text)

In [6]:
df.loc[0, 'preprocessed_combined_text']
#df.loc[0,'combined_text']

'summary director of biopharma equity research and publishing analyst on the rank sellside team on wall street biotechpharma industry veteran with prove success at analyze clinical trial assess drug market potential and at cultivate relationship with physicianthought leader in order to drive strategic business objective extremely knowledgeable about the fda drug approval process and other key driver of the biopharma industry education nyu stern school of business master of business administration mba finance management fev fundao getulio vargas to do business in brazil rutger university pharmd pharmacy nonenoneexperiencence evercore director biotechnology pharmaceuticalsmajor equity research no description evercore vice president biotechnology pharmaceuticalsmajor equity research cover biotech major pharma and specialty pharma company under head analyst umer raffat and previously under dr mark schoenebaum ii rank in large cap biotech in large cap pharma in specialty pharma ii rank in l

## Bag-of-word

In [12]:
vectorizer_limited = CountVectorizer(max_features=300, ngram_range=(1, 2))
bow_matrix_limited = vectorizer_limited.fit_transform(df['preprocessed_combined_text'])
bow_df_limited = pd.DataFrame(bow_matrix_limited.toarray(), columns=vectorizer_limited.get_feature_names_out())

whole_bow_df_limited = pd.concat([bow_df_limited,df],axis=1)

print('The number of columns in df is:', df.shape[1], \
      'The number of columns in bow_df is:', bow_df_limited.shape[1],\
      'The number of columns in whole_bow_df is:', whole_bow_df_limited.shape[1])

The number of columns in df is: 119 The number of columns in bow_df is: 300 The number of columns in whole_bow_df is: 419


In [13]:
vectorizer = CountVectorizer(ngram_range=(1, 1))
bow_matrix = vectorizer.fit_transform(df['preprocessed_combined_text'])
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

whole_bow_df = pd.concat([bow_df,df],axis=1)

print('The number of columns in df is:', df.shape[1], \
      'The number of columns in bow_df is:', bow_df.shape[1],\
      'The number of columns in whole_bow_df is:', whole_bow_df.shape[1])

The number of columns in df is: 119 The number of columns in bow_df is: 37036 The number of columns in whole_bow_df is: 37155


## TF-IDF

In [14]:
tfidf_transformer = TfidfTransformer()
tfidf_matrix_limited = tfidf_transformer.fit_transform(bow_matrix_limited)
tfidf_df_limited = pd.DataFrame(tfidf_matrix_limited.toarray(), columns=vectorizer_limited.get_feature_names_out())

whole_tfidf_df_limited = pd.concat([tfidf_df_limited,df],axis=1)
print('The number of columns in df is:', df.shape[1], \
      'The number of columns in tfidf_df is:', tfidf_df_limited.shape[1],\
      'The number of columns in whole_tfidf_df is:', whole_tfidf_df_limited.shape[1])

The number of columns in df is: 119 The number of columns in tfidf_df is: 300 The number of columns in whole_tfidf_df is: 419


In [15]:
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

whole_tfidf_df = pd.concat([tfidf_df,df],axis=1)
print('The number of columns in df is:', df.shape[1], \
      'The number of columns in tfidf_df is:', tfidf_df.shape[1],\
      'The number of columns in whole_tfidf_df is:', whole_tfidf_df.shape[1])

The number of columns in df is: 119 The number of columns in tfidf_df is: 37036 The number of columns in whole_tfidf_df is: 37155


# Part 3: Reduce dimension

In [16]:
selected_columns = whole_bow_df.columns[whole_bow_df.columns.str.startswith('skill_k50')]
bow_df_x = whole_bow_df.drop(selected_columns, axis=1)
# drop string columns in whole_bow_df
bow_df_x=bow_df_x.drop(['combined_text','preprocessed_combined_text','languages','summary','education','experiences','user_id'],axis=1)
bow_df_y = whole_bow_df.loc[:, selected_columns]

In [17]:
selected_columns = whole_tfidf_df.columns[whole_bow_df.columns.str.startswith('skill_k50')]
tfidf_df_x = whole_tfidf_df.drop(selected_columns, axis=1)
# drop string columns in whole_bow_df
tfidf_df_x=tfidf_df_x.drop(['combined_text','preprocessed_combined_text','languages','summary','education','experiences','user_id'],axis=1)
tfidf_df_y = whole_tfidf_df.loc[:, selected_columns]

## Truncated SVD

In [18]:
trunc = TruncatedSVD(n_components=200)

bow_array_x = bow_df_x.values
# Fit and transform the bow_matrix
bow_array_trunc = trunc.fit_transform(bow_array_x)

new_column_names = [str(i + 1) for i in range(trunc.n_components)]
# Convert the transformed matrix to a DataFrame
bow_df_trunc_x = pd.DataFrame(bow_array_trunc, columns=new_column_names)
bow_df_trunc_x.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
0,39.77675,-1.046949,1.115704,-1.938133,3.626245,6.694988,3.449337,7.53449,5.340878,-7.120664,...,0.606434,0.434857,-1.025383,-0.864405,-1.497541,-0.841022,1.82151,-1.437396,-2.397008,1.369091
1,2.295118,-5.172061,-0.068558,2.260915,0.251361,0.09368,0.797344,0.682476,-0.273334,0.149566,...,-0.096213,-0.158763,0.066178,0.10925,0.204426,-0.084276,0.245752,-0.261273,0.167872,0.083225
2,11.063633,-8.884498,-3.281833,7.204811,0.291344,-0.113647,0.112804,2.100417,1.291166,2.032216,...,0.4046,0.305402,-0.046361,0.108415,-0.448139,0.363067,0.382519,-0.513227,-0.457132,-0.172559
3,39.145283,-9.515285,-6.276065,-6.639078,-10.110341,10.676847,-4.411591,3.753459,-4.915163,-1.210364,...,-0.732387,-1.597697,-3.416819,-0.450448,0.004869,0.710006,-0.881024,0.830577,-0.525949,-2.054429
4,14.998849,-0.676625,1.804279,0.46194,-1.857784,4.125641,-2.008542,-1.76275,-0.547195,0.480726,...,0.594913,-0.689621,-0.084036,-0.073081,-0.500895,-0.508593,-0.368732,0.133467,0.166583,-0.32379


In [19]:
# Fit and transform the bow_matrix
tfidf_array_x = tfidf_df_x.values

tfidf_matrix_trunc = trunc.fit_transform(tfidf_array_x)

new_column_names = [str(i + 1) for i in range(trunc.n_components)]

# Convert the transformed matrix to a DataFrame
tfidf_df_trunc_x = pd.DataFrame(tfidf_matrix_trunc, columns=new_column_names)

## PCA

In [20]:
# Create an instance of PCA
pca = PCA(n_components=200)

bow_array_x = bow_df_x.values
# Fit and transform the bow_matrix
bow_array_pca = pca.fit_transform(bow_array_x)

# Create a list of column names as 1, 2, 3, ..., n_components
new_column_names = [str(i + 1) for i in range(pca.n_components)]

# Convert the transformed matrix to a DataFrame
bow_df_pca_x = pd.DataFrame(bow_array_pca, columns=new_column_names)

In [21]:
# Create an instance of PCA

tfidf_array_x = tfidf_df_x.values
# Fit and transform the bow_matrix
tfidf_array_pca = pca.fit_transform(tfidf_array_x)

# Create a list of column names as 1, 2, 3, ..., n_components
new_column_names = [str(i + 1) for i in range(pca.n_components)]

# Convert the transformed matrix to a DataFrame
tfidf_df_pca_x = pd.DataFrame(tfidf_array_pca, columns=new_column_names)

# Part 4: Machine learning models

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score
import lightgbm as lgb

In [23]:
selected_columns = whole_bow_df_limited.columns[whole_bow_df_limited.columns.str.startswith('skill_k50')]
bow_df_x_limited = whole_bow_df_limited.drop(selected_columns, axis=1)
# drop string columns in whole_bow_df
bow_df_x_limited=bow_df_x_limited.drop(['combined_text','preprocessed_combined_text','languages','summary','education','experiences','user_id'],axis=1)
bow_df_y_limited = whole_bow_df_limited.loc[:, selected_columns]

In [25]:
selected_columns = whole_tfidf_df_limited.columns[whole_bow_df_limited.columns.str.startswith('skill_k50')]
tfidf_df_x_limited = whole_tfidf_df_limited.drop(selected_columns, axis=1)
# drop string columns in whole_bow_df
tfidf_df_x_limited=tfidf_df_x_limited.drop(['combined_text','preprocessed_combined_text','languages','summary','education','experiences','user_id'],axis=1)
tfidf_df_y_limited = whole_tfidf_df_limited.loc[:, selected_columns]

## 1. Random Forest+bow+truncated svd

In [26]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_trunc_x, bow_df_y, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.18
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.00      0.00      0.00        81
                                    skill_k50_analysis / financial analysis / finance       0.90      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       1.00      0.01      0.02       115
                                  skill_k50_biotechnology / life sciences / chemistry       1.00      0.03      0.05        39
                        skill_k50_business analysis / change management / integration       0.83      0.05      0.10        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 2. Random Forest+bow+PCA

In [27]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_pca_x, bow_df_y, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.17
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.75      0.04      0.07        81
                                    skill_k50_analysis / financial analysis / finance       0.91      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.33      0.01      0.02       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.00      0.00      0.00        39
                        skill_k50_business analysis / change management / integration       0.17      0.01      0.02        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 3. Random Forest+TF-IDF+truncated svd

In [28]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_trunc_x, tfidf_df_y, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.18
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       1.00      0.01      0.02        81
                                    skill_k50_analysis / financial analysis / finance       0.91      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       1.00      0.04      0.08       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.90      0.23      0.37        39
                        skill_k50_business analysis / change management / integration       0.60      0.03      0.06        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 4. Random Forest+TF-IDF+PCA

In [29]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_pca_x, tfidf_df_y, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.18
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       1.00      0.02      0.05        81
                                    skill_k50_analysis / financial analysis / finance       0.91      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.80      0.03      0.07       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.89      0.21      0.33        39
                        skill_k50_business analysis / change management / integration       1.00      0.05      0.10        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 5. LightGBM+bow+truncated svd

In [30]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_trunc_x, bow_df_y, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.16
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.78      0.09      0.16        81
                                    skill_k50_analysis / financial analysis / finance       0.91      0.99      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.77      0.15      0.25       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.87      0.33      0.48        39
                        skill_k50_business analysis / change management / integration       0.80      0.04      0.08        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 6. LightGBM+bow+PCA

In [31]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_pca_x, bow_df_y, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.17
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       1.00      0.10      0.18        81
                                    skill_k50_analysis / financial analysis / finance       0.92      0.99      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.64      0.12      0.20       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.80      0.31      0.44        39
                        skill_k50_business analysis / change management / integration       0.75      0.06      0.11        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 7. LightGBM+TF-IDF+truncated svd

In [32]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_trunc_x, tfidf_df_y, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.16
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.85      0.14      0.23        81
                                    skill_k50_analysis / financial analysis / finance       0.92      0.98      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.73      0.19      0.30       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.86      0.64      0.74        39
                        skill_k50_business analysis / change management / integration       0.62      0.10      0.18        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 8. LightGBM+TF-IDF+PCA

In [33]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_pca_x, tfidf_df_y, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.16
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.85      0.14      0.23        81
                                    skill_k50_analysis / financial analysis / finance       0.92      0.98      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.87      0.23      0.36       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.89      0.62      0.73        39
                        skill_k50_business analysis / change management / integration       0.62      0.10      0.18        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 9. Random Forest+bow+max300

In [34]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_x_limited, bow_df_y_limited, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.18
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.67      0.02      0.05        81
                                    skill_k50_analysis / financial analysis / finance       0.91      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.83      0.04      0.08       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.00      0.00      0.00        39
                        skill_k50_business analysis / change management / integration       0.67      0.06      0.11        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 10. Ramdom Forest+tfidf+max300

In [35]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_x_limited, tfidf_df_y_limited, test_size=0.2, random_state=42)

model_RF = RandomForestClassifier()
multi_target_RF = MultiOutputClassifier(model_RF, n_jobs=-1)

multi_target_RF.fit(X_train, y_train)

y_pred=multi_target_RF.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.17
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.00      0.00      0.00        81
                                    skill_k50_analysis / financial analysis / finance       0.91      1.00      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.83      0.04      0.08       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.00      0.00      0.00        39
                        skill_k50_business analysis / change management / integration       1.00      0.01      0.02        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 11. LightGBM+bow+max300

In [36]:
X_train, X_test, y_train, y_test = train_test_split(bow_df_x_limited, bow_df_y_limited, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.14
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.52      0.16      0.25        81
                                    skill_k50_analysis / financial analysis / finance       0.93      0.98      0.95       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.62      0.22      0.32       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.82      0.23      0.36        39
                        skill_k50_business analysis / change management / integration       0.70      0.16      0.27        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))


## 12. LightGBM+tfidf+max300

In [37]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df_x_limited, tfidf_df_y_limited, test_size=0.2, random_state=42)

# Initialize LightGBM classifier and MultiOutputClassifier
model_LGBM = lgb.LGBMClassifier()
multi_target_LGBM = MultiOutputClassifier(model_LGBM, n_jobs=-1)

# Fit model on training data
multi_target_LGBM.fit(X_train, y_train)

# Predict on test data
y_pred = multi_target_LGBM.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=y_test.columns))

Accuracy: 0.14
                                                                                       precision    recall  f1-score   support

                                skill_k50_accounting / financial reporting / auditing       0.52      0.16      0.25        81
                                    skill_k50_analysis / financial analysis / finance       0.93      0.98      0.96       613
                              skill_k50_autocad / solidworks / mechanical engineering       0.00      0.00      0.00         8
                                               skill_k50_banking / insurance / credit       0.58      0.19      0.29       115
                                  skill_k50_biotechnology / life sciences / chemistry       0.79      0.28      0.42        39
                        skill_k50_business analysis / change management / integration       0.75      0.12      0.21        97
             skill_k50_coaching / leadership development / organizational development       0.0

  _warn_prf(average, modifier, msg_start, len(result))
