In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

In [36]:
#load the data set
df = pd.read_csv('MLProject.csv')
df.head(5)

Unnamed: 0,Sr.No.,Course,Job profession,Student,Linguistic,Musical,Bodily,Logical - Mathematical,Spatial-Visualization,Interpersonal,...,Naturalist,s/p,P1,P2,P3,P4,P5,P6,P7,P8
0,1.0,,Astronomer\n,S1,11.0,5.0,12.0,16.0,17.0,11.0,...,19.0,s1,AVG,POOR,AVG,BEST,BEST,AVG,BEST,BEST
1,,,Astronomer\n,S2,12.0,6.0,12.0,16.0,16.0,11.0,...,19.0,s2,AVG,POOR,AVG,BEST,BEST,AVG,BEST,BEST
2,,,Astronomer\n,S3,13.0,7.0,12.0,16.0,15.0,11.0,...,19.0,s3,AVG,POOR,AVG,BEST,BEST,AVG,BEST,BEST
3,,,Astronomer\n,S4,14.0,8.0,12.0,16.0,19.0,11.0,...,19.0,s4,AVG,POOR,AVG,BEST,BEST,AVG,BEST,BEST
4,,,Astronomer\n,S5,13.0,9.0,12.0,16.0,20.0,11.0,...,19.0,s5,AVG,POOR,AVG,BEST,BEST,AVG,BEST,BEST


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sr.No.                  72 non-null     float64
 1   Course                  0 non-null      float64
 2   Job profession          3600 non-null   object 
 3   Student                 3600 non-null   object 
 4   Linguistic              3600 non-null   float64
 5   Musical                 3600 non-null   float64
 6   Bodily                  3600 non-null   float64
 7   Logical - Mathematical  3600 non-null   float64
 8   Spatial-Visualization   3600 non-null   float64
 9   Interpersonal           3600 non-null   float64
 10  Intrapersonal           3600 non-null   float64
 11  Naturalist              3600 non-null   float64
 12  s/p                     3600 non-null   object 
 13  P1                      3600 non-null   object 
 14  P2                      3600 non-null   

In [38]:
df.isnull().sum()


Unnamed: 0,0
Sr.No.,3927
Course,3999
Job profession,399
Student,399
Linguistic,399
Musical,399
Bodily,399
Logical - Mathematical,399
Spatial-Visualization,399
Interpersonal,399


In [39]:
df.columns

Index(['Sr.No.', 'Course', 'Job profession', 'Student', 'Linguistic',
       'Musical', 'Bodily', 'Logical - Mathematical', 'Spatial-Visualization',
       'Interpersonal', 'Intrapersonal', 'Naturalist', 's/p', 'P1', 'P2', 'P3',
       'P4', 'P5', 'P6', 'P7', 'P8'],
      dtype='object')

In [40]:
df['Course'].isnull().sum()


3999

In [41]:
df['Job profession'].isnull().sum()


399

In [42]:
df_cleaned = df.drop(columns=['Course','Sr.No.'])
#I drop this because in this column so how maximum values are missing.

In [43]:
columns_to_check = [
    'Job profession', 'Student', 'Linguistic', 'Musical', 'Bodily',
    'Logical - Mathematical', 'Spatial-Visualization', 'Interpersonal',
    'Intrapersonal', 'Naturalist', 's/p', 'P1', 'P2', 'P3', 'P4',
    'P5', 'P6', 'P7', 'P8'
]

df_cleaned.dropna(subset=columns_to_check, inplace=True)


In [44]:
df_cleaned.isnull().sum()

Unnamed: 0,0
Job profession,0
Student,0
Linguistic,0
Musical,0
Bodily,0
Logical - Mathematical,0
Spatial-Visualization,0
Interpersonal,0
Intrapersonal,0
Naturalist,0


In [45]:
label_encoder = LabelEncoder()

In [46]:
df_cleaned.select_dtypes(include=['object']).columns.tolist()

['Job profession',
 'Student',
 's/p',
 'P1',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8']

In [47]:
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns.tolist()

In [48]:
for col in categorical_cols:
    df_cleaned[col] = label_encoder.fit_transform(df_cleaned[col])

In [49]:
df_cleaned.head(5)

Unnamed: 0,Job profession,Student,Linguistic,Musical,Bodily,Logical - Mathematical,Spatial-Visualization,Interpersonal,Intrapersonal,Naturalist,s/p,P1,P2,P3,P4,P5,P6,P7,P8
0,5,0,11.0,5.0,12.0,16.0,17.0,11.0,18.0,19.0,0,0,2,0,1,1,0,1,1
1,5,1105,12.0,6.0,12.0,16.0,16.0,11.0,18.0,19.0,1105,0,2,0,1,1,0,1,1
2,5,2216,13.0,7.0,12.0,16.0,15.0,11.0,18.0,19.0,2216,0,2,0,1,1,0,1,1
3,5,2928,14.0,8.0,12.0,16.0,19.0,11.0,18.0,19.0,2928,0,2,0,1,1,0,1,1
4,5,3039,13.0,9.0,12.0,16.0,20.0,11.0,19.0,19.0,3039,0,2,0,1,1,0,1,1


Allan On this I am trying to predicte the Job profession on the basis of other features.

In [50]:
# Define features (X) and target (y)
X = df_cleaned[['Student', 'Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical',
                'Spatial-Visualization', 'Interpersonal', 'Intrapersonal',
                'Naturalist', 's/p', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8']]
y = df_cleaned['Job profession']


In [51]:

class_distribution = df_cleaned['Job profession'].value_counts()
print(class_distribution)

Job profession
5     50
27    50
64    50
37    50
40    50
      ..
41    50
54    50
69    50
24    49
20    44
Name: count, Length: 72, dtype: int64


In [52]:
# split data randomly into 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, # Call train_test_split directly
                                                                    test_size=0.3, random_state=0)

In [53]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [54]:
y_pred = knn.predict(X_test)

In [55]:
y_pred

array([60, 25, 38, ..., 34, 30, 43])

In [56]:
y_test.values

array([60, 25, 38, ..., 34, 30, 43])

In [57]:
# Import the accuracy_score function from sklearn.metrics
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.90


In [58]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12  0  0 ...  0  0  0]
 [ 0 20  0 ...  0  0  0]
 [ 0  0 15 ...  0  0  0]
 ...
 [ 0  0  0 ... 18  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0 10]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.83      1.00      0.91        20
           2       0.94      1.00      0.97        15
           3       0.79      1.00      0.88        15
           4       0.85      1.00      0.92        11
           5       0.00      0.00      0.00        19
           6       0.85      1.00      0.92        17
           7       0.00      0.00      0.00        18
           8       0.95      1.00      0.97        18
           9       0.89      1.00      0.94        17
          10       1.00      1.00      1.00        13
          11       1.00      1.00      1.00         9
          12       0.96      1.00      0.98        22
          13       0.92      1.00      0.96        12
          14       0.87      1.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [59]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred, output_dict=True)

# Print only precision, recall, and F1-score for each class
for label, metrics in report.items():
    if label != 'accuracy':  # Remove the accuracy row
        print(f"Class: {label}")
        print(f"  Precision: {metrics['precision']}")
        print(f"  Recall: {metrics['recall']}")
        print(f"  F1-Score: {metrics['f1-score']}")
        print()


Class: 0
  Precision: 0.9230769230769231
  Recall: 1.0
  F1-Score: 0.96

Class: 1
  Precision: 0.8333333333333334
  Recall: 1.0
  F1-Score: 0.9090909090909091

Class: 2
  Precision: 0.9375
  Recall: 1.0
  F1-Score: 0.967741935483871

Class: 3
  Precision: 0.7894736842105263
  Recall: 1.0
  F1-Score: 0.8823529411764706

Class: 4
  Precision: 0.8461538461538461
  Recall: 1.0
  F1-Score: 0.9166666666666666

Class: 5
  Precision: 0.0
  Recall: 0.0
  F1-Score: 0.0

Class: 6
  Precision: 0.85
  Recall: 1.0
  F1-Score: 0.918918918918919

Class: 7
  Precision: 0.0
  Recall: 0.0
  F1-Score: 0.0

Class: 8
  Precision: 0.9473684210526315
  Recall: 1.0
  F1-Score: 0.972972972972973

Class: 9
  Precision: 0.8947368421052632
  Recall: 1.0
  F1-Score: 0.9444444444444444

Class: 10
  Precision: 1.0
  Recall: 1.0
  F1-Score: 1.0

Class: 11
  Precision: 1.0
  Recall: 1.0
  F1-Score: 1.0

Class: 12
  Precision: 0.9565217391304348
  Recall: 1.0
  F1-Score: 0.9777777777777777

Class: 13
  Precision: 0.9230

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
