In [49]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
# from tensorflow.keras.callbacks import EarlyStopping
# import pickle
# import tensorflow as tf

# # ------------------- Load and preprocess data -------------------
# data = pd.read_csv('career_dataset_large.csv')

# cols = ['Skills', 'Interests', 'Personality', 'Experience', 'Career']

# # Convert values to lowercase
# for col in cols:
#     if col in ['Skills', 'Interests']:
#         data[col] = data[col].apply(lambda x: ', '.join([i.strip().lower() for i in x.split(',')]))
#     else:
#         data[col] = data[col].str.lower()

# # Convert Skills and Interests into lists
# data['Skills'] = data['Skills'].apply(lambda x: [s.strip() for s in x.split(',')])
# data['Interests'] = data['Interests'].apply(lambda x: [i.strip() for i in x.split(',')])

# # MultiLabelBinarizer for Skills and Interests
# mlb_skills = MultiLabelBinarizer()
# skills_encoded = mlb_skills.fit_transform(data['Skills'])

# mlb_interests = MultiLabelBinarizer()
# interests_encoded = mlb_interests.fit_transform(data['Interests'])

# skills_df = pd.DataFrame(skills_encoded, columns=mlb_skills.classes_).reset_index(drop=True)
# interests_df = pd.DataFrame(interests_encoded, columns=mlb_interests.classes_).reset_index(drop=True)

# # OneHotEncoder for Personality, Experience, Career
# ohe = OneHotEncoder(sparse=False)
# features_to_ohe = data[['Personality','Experience','Career']]
# ohe_encoded_features = ohe.fit_transform(features_to_ohe)
# ohe_df = pd.DataFrame(ohe_encoded_features, columns=ohe.get_feature_names_out(features_to_ohe.columns)).reset_index(drop=True)

# # Combine all features
# final_df = pd.concat([skills_df, interests_df, ohe_df], axis=1)

# # Split features and target
# X = final_df.drop(columns=[col for col in final_df.columns if 'Career_' in col])
# y = final_df[[col for col in final_df.columns if 'Career_' in col]]

# # Save career columns for API
# career_columns = y.columns.tolist()
# with open("career_columns.pkl", "wb") as f:
#     pickle.dump(career_columns, f)

# # ------------------- Train/Test split -------------------
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # ------------------- Build model -------------------
# num_classes = y.shape[1]

# model = Sequential([
#     Dense(128, input_dim=X_train.shape[1]),
#     BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     Dropout(0.4),

#     Dense(64),
#     BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     Dropout(0.2),

#     Dense(32),
#     BatchNormalization(),
#     tf.keras.layers.Activation('relu'),

#     Dense(num_classes, activation='softmax')
# ])

# early_stop = EarlyStopping(
#     monitor='val_loss',
#     patience=10,
#     restore_best_weights=True
# )

# model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy']
# )

# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_test, y_test),
#     epochs=50,
#     batch_size=32,
#     callbacks=[early_stop],
#     verbose=1
# )

# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {accuracy*100:.2f}%")

# # ------------------- Save model -------------------
# model.save('career_recommender_model.h5')

# # ------------------- Save feature map for API -------------------
# import pickle

# feature_map = {col: idx for idx, col in enumerate(X.columns)}

# with open("feature_map.pkl", "wb") as f:
#     pickle.dump(feature_map, f)

# print("Feature map saved. Total features:", len(feature_map))

# print("Model, feature_map.pkl, and career_columns.pkl saved successfully!")


In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import tensorflow as tf

In [51]:
data = pd.read_csv('career_dataset_large.csv')

In [52]:
cols = ['Skills', 'Interests', 'Personality', 'Experience', 'Career']

# Convert values to lowercase
for col in cols:
    if col in ['Skills', 'Interests']:
        # Split the comma-separated string, lowercase each item, then join back
        data[col] = data[col].apply(lambda x: ', '.join([i.strip().lower() for i in x.split(',')]))
    else:
        # Just lowercase the string
        data[col] = data[col].str.lower()

# Check result
print(data.head())

                                        Skills  \
0       python, ml, cloud, data analysis, java   
1              java, cloud, sql, data analysis   
2                             sql, java, cloud   
3                                   ml, python   
4  cloud, java, sql, data analysis, management   

                           Interests Personality    Experience  \
0                   research, coding     logical  intermediate   
1            problem solving, coding  analytical      advanced   
2  problem solving, coding, analysis  analytical      advanced   
3           coding, research, devops     logical      beginner   
4            coding, problem solving  analytical  intermediate   

              Career  
0        ai engineer  
1  backend developer  
2  backend developer  
3        ai engineer  
4  backend developer  


In [53]:
data['Skills'] = data['Skills'].apply(lambda x: [s.strip() for s in x.split(',')])
data['Interests'] = data['Interests'].apply(lambda x: [i.strip() for i in x.split(',')])

In [54]:
mlb_skills = MultiLabelBinarizer()
skills_encoded = mlb_skills.fit_transform(data['Skills'])

mlb_interests = MultiLabelBinarizer()
interests_encoded = mlb_interests.fit_transform(data['Interests'])

In [55]:
skills_df = pd.DataFrame(skills_encoded, columns=mlb_skills.classes_).reset_index(drop=True)
interests_df = pd.DataFrame(interests_encoded, columns=mlb_interests.classes_).reset_index(drop=True)


In [56]:
ohe = OneHotEncoder(sparse=False)
features_to_ohe = data[['Personality','Experience','Career']]
ohe_encoded_features = ohe.fit_transform(features_to_ohe)
ohe_df = pd.DataFrame(ohe_encoded_features, columns=ohe.get_feature_names_out(features_to_ohe.columns)).reset_index(drop=True)




In [57]:
final_df = pd.concat([skills_df, interests_df, ohe_df], axis=1)


In [58]:


# Split features and target
X = final_df.drop(columns=[col for col in final_df.columns if 'Career_' in col])
y = final_df[[col for col in final_df.columns if 'Career_' in col]]



In [59]:
X = X.loc[:, ~X.columns.duplicated()]
print("X shape after removing duplicates:", X.shape)

X shape after removing duplicates: (4000, 23)


In [60]:
print("X shape:", X.shape)
print("Columns:", X.columns.tolist())

X shape: (4000, 23)
Columns: ['cloud', 'data analysis', 'design', 'html', 'java', 'management', 'ml', 'networking', 'python', 'sql', 'analysis', 'coding', 'devops', 'leadership', 'problem solving', 'research', 'Personality_analytical', 'Personality_creative', 'Personality_logical', 'Personality_practical', 'Experience_advanced', 'Experience_beginner', 'Experience_intermediate']


In [61]:
# ------------------- Train/Test split -------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [62]:
# ------------------- Build model -------------------
num_classes = y.shape[1]

model = Sequential([
    Dense(128, input_dim=X_train.shape[1]),
    BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    Dropout(0.4),

    Dense(64),
    BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    Dropout(0.2),

    Dense(32),
    BatchNormalization(),
    tf.keras.layers.Activation('relu'),

    Dense(num_classes, activation='softmax')
])

In [63]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)




In [64]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [65]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


In [66]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Accuracy: 95.75%


In [67]:
# ------------------- Save model -------------------
model.save('career_recommender_model.h5')

  saving_api.save_model(


In [68]:
# Save career columns for API
career_columns = y.columns.tolist()
with open("career_columns.pkl", "wb") as f:
    pickle.dump(career_columns, f)

In [69]:


# ------------------- Save feature map for API -------------------
import pickle

feature_map = {col: idx for idx, col in enumerate(X.columns)}

with open("feature_map.pkl", "wb") as f:
    pickle.dump(feature_map, f)

print("Feature map saved. Total features:", len(feature_map))

print("Model, feature_map.pkl, and career_columns.pkl saved successfully!")


Feature map saved. Total features: 23
Model, feature_map.pkl, and career_columns.pkl saved successfully!
