# Importing Libraries

In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("student-scores.csv")

In [3]:
data["career_aspiration"].unique()

array(['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
       'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
       'Banker', 'Writer', 'Accountant', 'Designer',
       'Construction Engineer', 'Game Developer', 'Stock Investor',
       'Real Estate Developer'], dtype=object)

In [4]:
data.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


# Drop Irrevalent columns that no need

In [5]:
data.drop(columns=["id",'first_name','last_name','email'],axis=1,inplace=True)

# Create 2 new columns / features sum and avg of all subjects

In [6]:
data["total_score"]=data["math_score"] + data["history_score"] + data["physics_score"] + data["chemistry_score"] + data["biology_score"] + data["english_score"] + data["geography_score"]

data["average"] = data["total_score"] / 7
data.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


In [7]:
data["gender"]=data["gender"].map({"male":0,"female":1})


In [8]:
data["extracurricular_activities"].value_counts()

extracurricular_activities
False    1592
True      408
Name: count, dtype: int64

In [9]:
data["extracurricular_activities"]=data["extracurricular_activities"].map({True:1,False:0})
data.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average
0,0,False,3,0,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,1,False,2,0,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,1,False,9,1,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,1,False,5,0,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,0,False,5,0,10,Unknown,84,77,65,65,80,74,76,521,74.428571


In [10]:
data["part_time_job"].value_counts()

part_time_job
False    1684
True      316
Name: count, dtype: int64

In [11]:
data["part_time_job"]=data["part_time_job"].map({False:0,True:1})

In [14]:
data["part_time_job"].value_counts()

part_time_job
0    1684
1     316
Name: count, dtype: int64

In [19]:
career_aspiration={'Lawyer':0, 'Doctor':1, 'Government Officer':2, 'Artist':3, 'Unknown':4,
       'Software Engineer':5, 'Teacher':6, 'Business Owner':7, 'Scientist':8,
       'Banker':9, 'Writer':10, 'Accountant':11, 'Designer':12,
       'Construction Engineer':13, 'Game Developer':14, 'Stock Investor':15,
       'Real Estate Developer':16}
data["career_aspiration"]=data["career_aspiration"].map(career_aspiration)

In [20]:
data.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average
0,0,0,3,0,27,0,73,81,93,97,63,80,87,574,82.0
1,1,0,2,0,47,1,90,86,96,100,90,88,90,640,91.428571
2,1,0,9,1,13,2,81,97,95,96,65,77,94,605,86.428571
3,1,0,5,0,3,3,71,74,88,80,89,63,86,551,78.714286
4,0,0,5,0,10,4,84,77,65,65,80,74,76,521,74.428571


In [21]:
data.shape

(2000, 15)

# Balance the dataset

In [22]:
data["career_aspiration"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int64)

In [23]:
data["career_aspiration"].value_counts()

career_aspiration
5     315
7     309
4     223
9     169
0     138
11    126
1     119
16     83
15     73
13     68
3      67
14     63
2      61
6      59
12     56
8      39
10     32
Name: count, dtype: int64

# using SMOTE for dataset balance

In [24]:
from imblearn.over_sampling import SMOTE
#Create object of SMOTE
smote=SMOTE(random_state=40)
#dependent and independent features\

X=data.drop("career_aspiration",axis=1)
y=data["career_aspiration"]

#Apply smote on data
X_resampled,y_resampled=smote.fit_resample(X,y)


In [25]:
y_resampled.value_counts()

career_aspiration
0     315
9     315
15    315
14    315
13    315
12    315
11    315
10    315
8     315
1     315
7     315
6     315
5     315
4     315
3     315
2     315
16    315
Name: count, dtype: int64

# Train test split

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_resampled,y_resampled,test_size=0.20,random_state=42)

In [27]:
X_train

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average
2369,0,0,0,0,47,97,88,96,97,90,97,93,661,94.476429
3955,1,0,3,0,23,81,73,93,78,35,96,82,541,77.317686
3785,1,0,1,0,16,92,74,99,87,61,95,94,604,86.360020
120,1,1,0,0,13,66,90,83,80,76,90,91,576,82.285714
3699,0,0,2,0,29,72,82,70,66,82,97,70,543,77.581881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,1,0,1,0,11,66,74,95,61,92,89,87,566,80.966712
3772,1,0,2,0,21,69,76,82,92,91,90,70,573,81.972892
5191,0,0,4,1,5,84,82,66,89,82,90,77,573,81.956769
5226,0,0,3,0,4,89,74,64,72,66,76,83,527,75.400787


# difference between value is greator so use feature scalling

In [28]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [29]:
X_test_scaled

array([[-0.77710596, -0.25418955, -0.45742417, ...,  1.11623591,
         0.74178014,  0.74012745],
       [ 1.2868258 ,  3.93407207,  1.39169605, ...,  1.11623591,
        -0.50018942, -0.50781232],
       [ 1.2868258 , -0.25418955, -0.91970423, ..., -0.46442186,
         0.35366465,  0.34534632],
       ...,
       [-0.77710596, -0.25418955,  0.46713594, ...,  0.5583567 ,
         0.66415705,  0.66892434],
       [ 1.2868258 , -0.25418955,  0.92941599, ...,  1.20921578,
         1.44038803,  1.45133605],
       [-0.77710596, -0.25418955,  1.8539761 , ...,  1.76709499,
        -0.00857647, -0.01659977]])

# Random Forest Model training

In [30]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier()
random_forest.fit(X_train_scaled,y_train)



In [31]:
y_pred=random_forest.predict(X_test_scaled)


In [32]:
y_pred

array([15,  7,  0, ..., 12, 14, 14], dtype=int64)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy=accuracy_score(y_test,y_pred)
confusion_matrix=confusion_matrix(y_test,y_pred)
classification_report=classification_report(y_test,y_pred)
print("accuracy:",accuracy)
print("Classification Report:",classification_report)
print("confusion matrix",confusion_matrix)

accuracy: 0.8216619981325863
Classification Report:               precision    recall  f1-score   support

           0       0.72      0.87      0.79        68
           1       0.82      0.92      0.87        72
           2       0.84      0.93      0.88        57
           3       0.90      0.95      0.92        58
           4       0.71      0.33      0.45        66
           5       0.62      0.42      0.50        76
           6       0.87      0.94      0.91        71
           7       0.97      0.92      0.94        61
           8       0.75      0.94      0.83        53
           9       0.65      0.61      0.63        61
          10       0.85      1.00      0.92        63
          11       0.84      0.81      0.83        53
          12       0.91      0.85      0.88        68
          13       0.76      0.96      0.85        55
          14       0.88      0.98      0.93        57
          15       0.90      0.84      0.87        63
          16       0.93      

# Single Input Prediction

In [36]:
print("Predicted label:",random_forest.predict(X_test_scaled[10].reshape(1,-1)))
print("Actual label:",y_test.iloc[10])

Predicted label: [12]
Actual label: 12


# model saving using pickle

In [37]:
import pickle
pickle.dump(scaler,open("model/scaler.pkl","wb"))
pickle.dump(random_forest,open("model/random_forest.pkl","wb"))

In [38]:
scaler=pickle.load(open("model/scaler.pkl","rb"))
model=pickle.load(open("model/random_forest.pkl","rb"))

In [52]:
import numpy as np
class_names=['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
       'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
       'Banker', 'Writer', 'Accountant', 'Designer',
       'Construction Engineer', 'Game Developer', 'Stock Investor',
       'Real Estate Developer']
def Recommondation(gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,
                   math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,
                   total_score,average):
    # encode categorical variables
    gender_encoded=1 if gender.lower()=="female" else 0
    part_time_job_encoded=1 if part_time_job else 0
    extracurricular_activities_encoded=1 if extracurricular_activities else 0
    #create feature array
    feature_array=np.array([[gender_encoded,part_time_job_encoded,absence_days,extracurricular_activities_encoded,weekly_self_study_hours,
                   math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,
                   total_score,average]])
    scaled_feature=scaler.transform(feature_array)
    #predict using model
    probabilities=random_forest.predict_proba(scaled_feature)
    #get top 5 predicted classes with probability
    top_classes_idx=np.argsort(-probabilities[0])[:5]
    top_classes_names_prob=[(class_names[idx],probabilities[0][idx]) for idx in top_classes_idx]
    return top_classes_names_prob

In [53]:
final_recommendation=Recommondation(gender="female",
                                   part_time_job=False,
                                   absence_days=2,
                                   extracurricular_activities=False,
                                   weekly_self_study_hours=5,
                                   math_score=87,
                                   history_score=65,
                                   physics_score=88,
                                   chemistry_score=55,
                                   biology_score=55,
                                   english_score=44,
                                   geography_score=44,
                                   total_score=554,
                                   average=76.444)
print("Top Recommend Studies with Probabilities:")
print("="*50)
for class_name,probability in final_recommendation:
    print(f"{class_name} with probability {probability}")
    

Top Recommend Studies with Probabilities:
Business Owner with probability 0.59
Game Developer with probability 0.18
Real Estate Developer with probability 0.09
Government Officer with probability 0.06
Accountant with probability 0.04


