In [2]:
import pandas as pd
import json

# Load the dataset from JSON file
with open('dataset.json', 'r') as file:
    data = json.load(file)

# Create a DataFrame
df = pd.DataFrame(data)

# Drop columns
columns_to_drop = ["studentid", "roll", "studentname", "package"]
df.drop(columns=columns_to_drop, inplace=True)

print(df)


                           selected thubornot    owlhero  \
0     TCS Digital,Tyke,ZF India,DXC      Thub  OWL CODER   
1                               DXC      None       None   
2                         TCS Ninja      None       None   
3                               JIO      None       None   
4                               DXC      None       None   
...                             ...       ...        ...   
1408                        Kyndryl      None       None   
1409                       Hexaware      None       None   
1410        Value Momentum,Hexaware      None       None   
1411                 Pentagon Space      None       None   
1412                       Hexaware      None       None   

                                         certifications  \
0     AWS Certified Cloud Practitione , ,AWS Certifi...   
1                   , ,AWS Certified Cloud Practitioner   
2                   , ,AWS Certified Cloud Practitioner   
3                                          

In [3]:
df.head()

Unnamed: 0,selected,thubornot,owlhero,certifications,skills
0,"TCS Digital,Tyke,ZF India,DXC",Thub,OWL CODER,"AWS Certified Cloud Practitione , ,AWS Certifi...","aws, mongo db"
1,DXC,,,", ,AWS Certified Cloud Practitioner","html, node js"
2,TCS Ninja,,,", ,AWS Certified Cloud Practitioner","blender, blender"
3,JIO,,,,html
4,DXC,,,",Oracle Cloud Infrastructure Foundations Asso...","mongo db, machine learning"


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer

# Features (X) and target variable (y)
X = df[['selected', 'thubornot', 'owlhero', 'certifications', 'skills']]
y = df['selected']

# Tokenize labels (y) using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_tokenized = mlb.fit_transform(y.str.split(','))

# Split the data into training and testing sets with a 70-30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y_tokenized, test_size=0.3, random_state=42)

def tokenize(text):
    return text.split(',')
categorical_cols = ['selected', 'thubornot', 'owlhero', 'certifications', 'skills']
categorical_transformer = ColumnTransformer(
    transformers=[
        ('selected', CountVectorizer(tokenizer=tokenize), 'selected'),
        ('thubornot', CountVectorizer(tokenizer=tokenize), 'thubornot'),
        ('owlhero', CountVectorizer(tokenizer=tokenize), 'owlhero'),
        ('certifications', CountVectorizer(tokenizer=tokenize), 'certifications'),
        ('skills', CountVectorizer(tokenizer=tokenize), 'skills')
    ])
rf_model = Pipeline(steps=[('preprocessor', categorical_transformer),
                           ('classifier', RandomForestClassifier(random_state=42))])
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Accuracy: 0.7122641509433962


In [15]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'classifier__n_estimators': [5, 10, 20],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)




Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 20}
Best Accuracy: 0.6441111623852741
Test Accuracy: 0.6981132075471698


In [17]:

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
test_cases = [
    {
       
        'thubornot': 'Thub',
        'owlhero': 'OWL CODER',
        'certifications': 'AWS Certified Cloud Practitioner , ,AWS Certified Cloud Practitioner',
        'skills': 'python'
    },
    {
        
        'thubornot': 'None',
        'owlhero': 'None',
        'certifications': 'None',
        'skills': 'python'
    }

]


def preprocess_test_cases(test_cases):
    test_df = pd.DataFrame(test_cases)
    test_df['selected'] = test_df['selected'].apply(lambda x: ','.join(x.split(',')))
    return test_df

test_df = preprocess_test_cases(test_cases)

predictions = rf_model.predict(test_df)
predicted_labels = mlb.inverse_transform(predictions)
print("Predicted labels:")
for labels in predicted_labels:
    print(labels)


KeyError: 'selected'