In [2]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each archive to disk.
CHUNK_SIZE = 40960
# Comma-separated entries of the form "<directory>:<percent-encoded download URL>".
# NOTE(review): the URL is a signed link (X-Goog-Date=20240910T215638Z,
# X-Goog-Expires=259200), so it presumably stops working once that window
# has elapsed — regenerate it if the download fails.
DATA_SOURCE_MAPPING = 'student-career-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5627874%2F9295523%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240910%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240910T215638Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D76df8fb1505746109db78bbdb3adad136e3227226a1c3f87c791d9af9a99235090c5c3e098eca05bca4f4838c4ab8c44fe9c2f54d763c460d0f7127839d16322fdb43ff3fee15eef91fe4006d7229c846508a529d5f4961bf4a415e157cc4ad87a678dc244c5e02ad86fa0a316d93e86349a8a94177bb25dfde6b43515ee53e348881bafe96c6a897de9796edd6650947cc7529e177c24c69d66567d85b5bed496d2a88fec6717ac0d641bab7050ac587f7914f93e79574578a00e93591bfdbe63dcccff01745c00ebab61d6ab45aa5eb4a4f5a4c160b1d9ed40dedcf4972f3701784d791a5f10e9f9a963159faa2b51e1fe40aac3fa3b469e2d6854b75eb3aa'

# Root directories the rest of the notebook reads from / writes to.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download and unpack every data source listed in DATA_SOURCE_MAPPING.
# Each comma-separated entry has the form "<directory>:<percent-encoded URL>";
# the archive is streamed to a temporary file and extracted into
# KAGGLE_INPUT_PATH/<directory>. A failed download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent (e.g. chunked responses); 0 means
            # "size unknown" and simply leaves the progress bar empty.
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which stops the iterator.
            for chunk in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations or cells.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading student-career-prediction, 21112 bytes compressed
Downloaded and uncompressed: student-career-prediction
Data source import complete.


# **Load data**

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Read the training CSV of the student-career-prediction dataset and preview it.
training_csv = '/kaggle/input/student-career-prediction/stud_training.csv'
data = pd.read_csv(training_csv)
data.head()

Unnamed: 0,Drawing,Dancing,Singing,Sports,Video Game,Acting,Travelling,Gardening,Animals,Photography,...,Engeeniering,Doctor,Pharmisist,Cycling,Knitting,Director,Journalism,Bussiness,Listening Music,Courses
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,BBA- Bachelor of Business Administration
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration


In [31]:
# Read the held-out testing CSV of the same dataset and preview it.
testing_csv = '/kaggle/input/student-career-prediction/stud_testing.csv'
testing_data = pd.read_csv(testing_csv)
testing_data.head()

Unnamed: 0,Drawing,Dancing,Singing,Sports,Video Game,Acting,Travelling,Gardening,Animals,Photography,...,Engeeniering,Doctor,Pharmisist,Cycling,Knitting,Director,Journalism,Bussiness,Listening Music,Courses
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,BBA- Bachelor of Business Administration
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,BEM- Bachelor of Event Management
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Integrated Law Course- BA + LL.B
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,BJMC- Bachelor of Journalism and Mass Communic...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,BFD- Bachelor of Fashion Designing


# **Preprocessing data**

In [12]:
# Dimensions (rows, columns) of the training frame.
data.shape

(3500, 60)

In [39]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

3323

In [40]:
# Keep only the first occurrence of each distinct row.
data = data.drop_duplicates()

In [32]:
# Dimensions (rows, columns) of the testing frame.
testing_data.shape

(35, 60)

In [15]:
# Count of missing values in each column of the training frame.
data.isnull().sum()

Unnamed: 0,0
Drawing,0
Dancing,0
Singing,0
Sports,0
Video Game,0
Acting,0
Travelling,0
Gardening,0
Animals,0
Photography,0


In [18]:
# Column labels of the training frame (feature flags plus the 'Courses' target).
data.columns

Index(['Drawing', 'Dancing', 'Singing', 'Sports', 'Video Game', 'Acting',
       'Travelling', 'Gardening', 'Animals', 'Photography', 'Teaching',
       'Exercise', 'Coding', 'Electricity Components', 'Mechanic Parts',
       'Computer Parts', 'Researching', 'Architecture', 'Historic Collection',
       'Botany', 'Zoology', 'Physics', 'Accounting', 'Economics', 'Sociology',
       'Geography', 'Psycology', 'History', 'Science', 'Bussiness Education',
       'Chemistry', 'Mathematics', 'Biology', 'Makeup', 'Designing',
       'Content writing', 'Crafting', 'Literature', 'Reading', 'Cartooning',
       'Debating', 'Asrtology', 'Hindi', 'French', 'English', 'Urdu',
       'Other Language', 'Solving Puzzles', 'Gymnastics', 'Yoga',
       'Engeeniering', 'Doctor', 'Pharmisist', 'Cycling', 'Knitting',
       'Director', 'Journalism', 'Bussiness', 'Listening Music', 'Courses'],
      dtype='object')

In [19]:
# Fold overlapping interests of the training frame into one flag each.
# The member columns are summed and any sum of 2 or more is mapped back to 1
# (the previews show 0/1 columns — assumed for the whole frame).
# NOTE: not re-runnable as-is — the source columns are dropped at the end.
data['Sports'] = (
    data['Sports']
    .add(data['Cycling'])
    .add(data['Gymnastics'])
    .add(data['Exercise'])
    .add(data['Dancing'])
    .replace({2: 1, 3: 1, 4: 1, 5: 1})
)
data['Languages'] = (
    data['Hindi']
    .add(data['Urdu'])
    .add(data['Other Language'])
    .replace({2: 1, 3: 1})
)
data['Animals'] = data['Animals'].add(data['Zoology']).replace({2: 1})

# Remove the columns that were merged above; 'Engeeniering' and 'Doctor' are
# removed as well without being merged into anything.
data.drop(
    columns=[
        'Hindi', 'Urdu', 'Other Language',
        'Cycling', 'Gymnastics', 'Exercise', 'Dancing',
        'Engeeniering', 'Doctor', 'Zoology',
    ],
    inplace=True,
)

In [34]:
# Apply to the testing frame the same column merge used on the training frame,
# so both end up with an identical feature layout.
# The member columns are summed and any sum of 2 or more is mapped back to 1.
# NOTE: not re-runnable as-is — the source columns are dropped at the end.
testing_data['Sports'] = (
    testing_data['Sports']
    .add(testing_data['Cycling'])
    .add(testing_data['Gymnastics'])
    .add(testing_data['Exercise'])
    .add(testing_data['Dancing'])
    .replace({2: 1, 3: 1, 4: 1, 5: 1})
)
testing_data['Languages'] = (
    testing_data['Hindi']
    .add(testing_data['Urdu'])
    .add(testing_data['Other Language'])
    .replace({2: 1, 3: 1})
)
testing_data['Animals'] = testing_data['Animals'].add(testing_data['Zoology']).replace({2: 1})

# Remove the merged source columns; 'Engeeniering' and 'Doctor' are removed
# as well without being merged into anything.
testing_data.drop(
    columns=[
        'Hindi', 'Urdu', 'Other Language',
        'Cycling', 'Gymnastics', 'Exercise', 'Dancing',
        'Engeeniering', 'Doctor', 'Zoology',
    ],
    inplace=True,
)

In [20]:
# Dimensions of the training frame after the related columns were combined
# and their sources dropped.
data.shape

(3500, 51)

In [35]:
# Dimensions of the testing frame after the same column merge.
testing_data.shape

(35, 51)

# **Model**

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [42]:
# Target is the 'Courses' label; every remaining column is a feature.
y = data['Courses']
x = data.drop(columns=['Courses'])

In [43]:
# Hold out 20% of the training frame for validation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
# Seed the forest so repeated runs build the same trees and report the same
# accuracy (42 matches the seed used for the train/validation split).
model = RandomForestClassifier(random_state=42)

# training model
model.fit(x_train, y_train)

# Prediction on the held-out validation rows
pred = model.predict(x_test)

In [45]:
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# The value is the same either way, but the order matters as soon as this call
# is swapped for an asymmetric metric.
accuracy = accuracy_score(y_test, pred)
accuracy

0.8888888888888888

In [46]:
# Same feature/target separation for the external testing frame.
y_testing = testing_data['Courses']
x_testing = testing_data.drop(columns=['Courses'])

In [47]:
# Score the fitted model on the external testing frame.
pred_testing = model.predict(x_testing)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
accuracy_testing = accuracy_score(y_testing, pred_testing)
accuracy_testing

0.9714285714285714

In [38]:
# Persist the fitted classifier to the working directory so it can be
# reloaded later without retraining.
import joblib

joblib.dump(value=model, filename='career_prediction_model.joblib')

['career_prediction_model.joblib']