In [129]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'student-career-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5627874%2F9295523%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240910%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240910T215638Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D76df8fb1505746109db78bbdb3adad136e3227226a1c3f87c791d9af9a99235090c5c3e098eca05bca4f4838c4ab8c44fe9c2f54d763c460d0f7127839d16322fdb43ff3fee15eef91fe4006d7229c846508a529d5f4961bf4a415e157cc4ad87a678dc244c5e02ad86fa0a316d93e86349a8a94177bb25dfde6b43515ee53e348881bafe96c6a897de9796edd6650947cc7529e177c24c69d66567d85b5bed496d2a88fec6717ac0d641bab7050ac587f7914f93e79574578a00e93591bfdbe63dcccff01745c00ebab61d6ab45aa5eb4a4f5a4c160b1d9ed40dedcf4972f3701784d791a5f10e9f9a963159faa2b51e1fe40aac3fa3b469e2d6854b75eb3aa'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading student-career-prediction, 21112 bytes compressed
Downloaded and uncompressed: student-career-prediction
Data source import complete.


# **upload data**

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
data = pd.read_csv('/kaggle/input/student-career-prediction/stud_training.csv')
data.head()

Unnamed: 0,Drawing,Dancing,Singing,Sports,Video Game,Acting,Travelling,Gardening,Animals,Photography,...,Engeeniering,Doctor,Pharmisist,Cycling,Knitting,Director,Journalism,Bussiness,Listening Music,Courses
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,BBA- Bachelor of Business Administration
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration


In [132]:
testing_data =  pd.read_csv('/kaggle/input/student-career-prediction/stud_testing.csv')
testing_data.head()

Unnamed: 0,Drawing,Dancing,Singing,Sports,Video Game,Acting,Travelling,Gardening,Animals,Photography,...,Engeeniering,Doctor,Pharmisist,Cycling,Knitting,Director,Journalism,Bussiness,Listening Music,Courses
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,BEM- Bachelor of Event Management
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Integrated Law Course- BA + LL.B
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,BJMC- Bachelor of Journalism and Mass Communic...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,BFD- Bachelor of Fashion Designing


# **Preprocessing data**

In [133]:
data.shape

(3500, 60)

In [134]:
data.duplicated().sum()

3313

In [135]:
data.shape

(3500, 60)

In [136]:
data.drop_duplicates(inplace=True)

In [137]:
testing_data.shape

(35, 60)

In [138]:
data.isnull().sum()

Unnamed: 0,0
Drawing,0
Dancing,0
Singing,0
Sports,0
Video Game,0
Acting,0
Travelling,0
Gardening,0
Animals,0
Photography,0


In [139]:
data.columns

Index(['Drawing', 'Dancing', 'Singing', 'Sports', 'Video Game', 'Acting',
       'Travelling', 'Gardening', 'Animals', 'Photography', 'Teaching',
       'Exercise', 'Coding', 'Electricity Components', 'Mechanic Parts',
       'Computer Parts', 'Researching', 'Architecture', 'Historic Collection',
       'Botany', 'Zoology', 'Physics', 'Accounting', 'Economics', 'Sociology',
       'Geography', 'Psycology', 'History', 'Science', 'Bussiness Education',
       'Chemistry', 'Mathematics', 'Biology', 'Makeup', 'Designing',
       'Content writing', 'Crafting', 'Literature', 'Reading', 'Cartooning',
       'Debating', 'Asrtology', 'Hindi', 'French', 'English', 'Urdu',
       'Other Language', 'Solving Puzzles', 'Gymnastics', 'Yoga',
       'Engeeniering', 'Doctor', 'Pharmisist', 'Cycling', 'Knitting',
       'Director', 'Journalism', 'Bussiness', 'Listening Music', 'Courses'],
      dtype='object')

In [140]:
data['Sports'] = (data['Sports'] + data['Cycling'] + data['Gymnastics'] + data['Exercise'] + data['Dancing']).replace([2, 3, 4, 5] , 1)
data['Languages'] = (data['Hindi'] + data['Urdu'] + data['Other Language']).replace([2, 3] , 1)
data['Animals'] = (data['Animals'] + data['Zoology']).replace([2] , 1)
data.drop(['Hindi', 'Urdu', 'Other Language', 'Cycling', 'Gymnastics', 'Exercise', 'Dancing' , 'Engeeniering' , 'Doctor' , 'Zoology'], axis=1, inplace=True)

In [141]:
testing_data['Sports'] = (testing_data['Sports'] + testing_data['Cycling'] + testing_data['Gymnastics'] + testing_data['Exercise'] + testing_data['Dancing']).replace([2, 3, 4, 5] , 1)
testing_data['Languages'] = (testing_data['Hindi'] + testing_data['Urdu'] + testing_data['Other Language']).replace([2, 3] , 1)
testing_data['Animals'] = (testing_data['Animals'] + testing_data['Zoology']).replace([2] , 1)
testing_data.drop(['Hindi', 'Urdu', 'Other Language', 'Cycling', 'Gymnastics', 'Exercise', 'Dancing' , 'Engeeniering' , 'Doctor' , 'Zoology'], axis=1, inplace=True)

In [142]:
data.shape

(187, 51)

In [143]:
testing_data.shape

(35, 51)

In [144]:
data.columns

Index(['Drawing', 'Singing', 'Sports', 'Video Game', 'Acting', 'Travelling',
       'Gardening', 'Animals', 'Photography', 'Teaching', 'Coding',
       'Electricity Components', 'Mechanic Parts', 'Computer Parts',
       'Researching', 'Architecture', 'Historic Collection', 'Botany',
       'Physics', 'Accounting', 'Economics', 'Sociology', 'Geography',
       'Psycology', 'History', 'Science', 'Bussiness Education', 'Chemistry',
       'Mathematics', 'Biology', 'Makeup', 'Designing', 'Content writing',
       'Crafting', 'Literature', 'Reading', 'Cartooning', 'Debating',
       'Asrtology', 'French', 'English', 'Solving Puzzles', 'Yoga',
       'Pharmisist', 'Knitting', 'Director', 'Journalism', 'Bussiness',
       'Listening Music', 'Courses', 'Languages'],
      dtype='object')

In [145]:
new_order = ['Drawing', 'Singing', 'Sports', 'Video Game', 'Acting', 'Travelling',
       'Gardening', 'Animals', 'Photography', 'Teaching', 'Coding',
       'Electricity Components', 'Mechanic Parts', 'Computer Parts',
       'Researching', 'Architecture', 'Historic Collection', 'Botany',
       'Physics', 'Accounting', 'Economics', 'Sociology', 'Geography',
       'Psycology', 'History', 'Science', 'Bussiness Education', 'Chemistry',
       'Mathematics', 'Biology', 'Makeup', 'Designing', 'Content writing',
       'Crafting', 'Literature', 'Reading', 'Cartooning', 'Debating',
       'Asrtology', 'French', 'English', 'Solving Puzzles', 'Yoga',
       'Pharmisist', 'Knitting', 'Director', 'Journalism', 'Bussiness',
       'Listening Music', 'Languages', 'Courses']

data = data[new_order]
testing_data = testing_data[new_order]
data.head()

Unnamed: 0,Drawing,Singing,Sports,Video Game,Acting,Travelling,Gardening,Animals,Photography,Teaching,...,Solving Puzzles,Yoga,Pharmisist,Knitting,Director,Journalism,Bussiness,Listening Music,Languages,Courses
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,BBA- Bachelor of Business Administration
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,BBA- Bachelor of Business Administration
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,BBA- Bachelor of Business Administration
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,BBA- Bachelor of Business Administration
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,BBA- Bachelor of Business Administration


In [146]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder
encoder = LabelEncoder()

data['Courses'] = encoder.fit_transform(data['Courses'])
testing_data['Courses'] = encoder.transform(testing_data['Courses'])


In [147]:
data.to_csv('data.csv', index=False)
testing_data.to_csv('testing_data.csv', index=False)

# **Model**

In [148]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score , accuracy_score , confusion_matrix


In [149]:
X = data.drop(['Courses'], axis = 1)
y = data['Courses']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_testing = testing_data.drop(['Courses'], axis = 1)
y_testing = testing_data['Courses']

In [150]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Random Forest

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
model1 = RandomForestClassifier()

# training model
model1.fit(X_train , y_train)

# Prediction
y_pred1 = model1.predict(X_test)

In [153]:


accuracy1 = accuracy_score(y_pred1, y_test)
print(f'accuracy: {accuracy1}')

# Precision, Recall, F1 Score
precision1 = precision_score(y_test, y_pred1 , average='macro')
recall1 = recall_score(y_test, y_pred1,average='macro')
f11 = f1_score(y_test, y_pred1,average='macro')


print(f'precision: {precision1}')
print(f'Recall: {recall1}')
print(f'F1 Score: {f11}')

accuracy: 0.8947368421052632
precision: 0.85
Recall: 0.875
F1 Score: 0.8571428571428571


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [177]:
pred_testing = model1.predict(X_testing)
accuracy_testing = accuracy_score(pred_testing, y_testing)
accuracy_testing

0.9428571428571428

Decision Tree

In [155]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred2 = dt_classifier.predict(X_test)
accuracy2 = accuracy_score(y_test, y_pred2)

print(f'Accuracy: {accuracy2}')


# Precision, Recall, F1 Score
precision2 = precision_score(y_test, y_pred2,average='macro')
recall2 = recall_score(y_test, y_pred2,average='macro')
f12 = f1_score(y_test, y_pred2,average='macro')


print(f'Precision: {precision2}')
print(f'Recall: {recall2}')
print(f'F1 Score: {f12}')



Accuracy: 0.6842105263157895
Precision: 0.59
Recall: 0.64
F1 Score: 0.5796190476190476


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [176]:
pred_testing = dt_classifier.predict(X_testing)
accuracy_testing = accuracy_score(pred_testing, y_testing)
accuracy_testing

0.9428571428571428

XGB

In [157]:
from xgboost import XGBClassifier

model2 = XGBClassifier()
model2.fit(X_train, y_train)
y_pred3 = model2.predict(X_test)


accuracy3 = accuracy_score(y_test, y_pred3)

print(f'Accuracy: {accuracy3}')

precision3 = precision_score(y_test, y_pred3,average='micro')
recall3 = recall_score(y_test, y_pred3,average='micro')
f13 = f1_score(y_test, y_pred3,average='micro')

# Print individual scores
print(f'Precision: {precision3}')
print(f'Recall: {recall3}')
print(f'F1 Score: {f13}')

Accuracy: 0.868421052631579
Precision: 0.868421052631579
Recall: 0.868421052631579
F1 Score: 0.868421052631579


In [159]:
y_pred3

array([ 3, 22,  1, 29, 34, 33,  5, 32,  1, 16, 32, 28, 31, 20,  1, 33, 31,
       16,  4, 25, 14, 12, 19, 30, 25, 32, 23, 29, 15, 34, 25, 17, 22,  5,
        5, 27,  6, 30])

# **Saving the Model**

In [174]:
# Save the models for future use
import joblib

joblib.dump(model1, 'career_prediction_model.joblib')

['career_prediction_model.joblib']

# Functions

In [173]:
def career(pred):
  careers = ['BBA- Bachelor of Business Administration',
       'BEM- Bachelor of Event Management',
       'Integrated Law Course- BA + LL.B',
       'BJMC- Bachelor of Journalism and Mass Communication',
       'BFD- Bachelor of Fashion Designing',
       'BBS- Bachelor of Business Studies',
       'BTTM- Bachelor of Travel and Tourism Management',
       'BVA- Bachelor of Visual Arts', 'BA in History',
       'B.Arch- Bachelor of Architecture',
       'BCA- Bachelor of Computer Applications',
       'B.Sc.- Information Technology', 'B.Sc- Nursing',
       'BPharma- Bachelor of Pharmacy', 'BDS- Bachelor of Dental Surgery',
       'Animation, Graphics and Multimedia', 'B.Sc- Applied Geology',
       'B.Sc.- Physics', 'B.Sc. Chemistry', 'B.Sc. Mathematics',
       'B.Tech.-Civil Engineering',
       'B.Tech.-Computer Science and Engineering',
       'B.Tech.-Electronics and Communication Engineering',
       'B.Tech.-Electrical and Electronics Engineering',
       'B.Tech.-Mechanical Engineering', 'B.Com- Bachelor of Commerce',
       'BA in Economics', 'CA- Chartered Accountancy',
       'CS- Company Secretary', 'Diploma in Dramatic Arts', 'MBBS',
       'Civil Services', 'BA in English', 'BA in Hindi', 'B.Ed.']
  result = []
  for i in list(pred):
       result.append(careers[i])

       return result