In [64]:
import numpy as np
import pandas as pd 
# import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

# from sklearn.model_selection import cross_val_score

In [47]:
data = pd.read_csv('Job_Placement_Data.csv')
data.head()

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,hsc_subject,degree_percentage,undergrad_degree,work_experience,emp_test_percentage,specialisation,mba_percent,status
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed


### EDA

In [48]:
# Here User ID is not suitable to predict the results,so we are ignore this coloumn
data = data[['gender','degree_percentage','work_experience','emp_test_percentage','status']]

In [49]:
print(data.head())

  gender  degree_percentage work_experience  emp_test_percentage      status
0      M              58.00              No                 55.0      Placed
1      M              77.48             Yes                 86.5      Placed
2      M              64.00              No                 75.0      Placed
3      M              52.00              No                 66.0  Not Placed
4      M              73.30              No                 96.8      Placed


In [52]:
data.isnull().sum()

gender                 0
degree_percentage      0
work_experience        0
emp_test_percentage    0
status                 0
dtype: int64

In [53]:
data.shape

(215, 5)

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               215 non-null    object 
 1   degree_percentage    215 non-null    float64
 2   work_experience      215 non-null    object 
 3   emp_test_percentage  215 non-null    float64
 4   status               215 non-null    object 
dtypes: float64(2), object(3)
memory usage: 8.5+ KB


In [58]:
for i in ['status','work_experience','gender']:
    print('column ',i,' : ',data[i].unique())

column  status  :  ['Placed' 'Not Placed']
column  work_experience  :  ['No' 'Yes']
column  gender  :  ['M' 'F']


In [59]:
data.corr()

Unnamed: 0,degree_percentage,emp_test_percentage
degree_percentage,1.0,0.22447
emp_test_percentage,0.22447,1.0


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               215 non-null    object 
 1   degree_percentage    215 non-null    float64
 2   work_experience      215 non-null    object 
 3   emp_test_percentage  215 non-null    float64
 4   status               215 non-null    object 
dtypes: float64(2), object(3)
memory usage: 8.5+ KB


### Feature engineering

In [63]:
# select all categorical variables
dataset_categorical = data.select_dtypes(include=['object'])
dataset_categorical.head()

Unnamed: 0,gender,work_experience,status
0,M,No,Placed
1,M,Yes,Placed
2,M,No,Placed
3,M,No,Not Placed
4,M,No,Placed


In [65]:
label_encoder = LabelEncoder()
dataset_categorical = dataset_categorical.apply(label_encoder.fit_transform)
dataset_categorical.head()

Unnamed: 0,gender,work_experience,status
0,1,0,1
1,1,1,1
2,1,0,1
3,1,0,0
4,1,0,1


In [66]:
df = data.drop(dataset_categorical.columns, axis=1)# drop categorical
df = pd.concat([df, dataset_categorical], axis=1)# concat for dataframe
df.head()

Unnamed: 0,degree_percentage,emp_test_percentage,gender,work_experience,status
0,58.0,55.0,1,0,1
1,77.48,86.5,1,1,1
2,64.0,75.0,1,0,1
3,52.0,66.0,1,0,0
4,73.3,96.8,1,0,1


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   degree_percentage    215 non-null    float64
 1   emp_test_percentage  215 non-null    float64
 2   gender               215 non-null    int64  
 3   work_experience      215 non-null    int64  
 4   status               215 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 8.5 KB


In [68]:
# convert target variable status to categorical
df['status'] = df['status'].astype('category')
df['status'].head()

0    1
1    1
2    1
3    0
4    1
Name: status, dtype: category
Categories (2, int64): [0, 1]

In [69]:
df['status'].value_counts()

1    148
0     67
Name: status, dtype: int64

In [70]:
X = df.drop(columns='status',axis=1)
y = df['status']

In [72]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.3)

### Preprocessing

In [73]:
s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.fit_transform(X_test)

In [74]:
log = LogisticRegression()
log.fit(X_train,y_train)
predict = log.predict(X_test)

### Calculate Accuracy Of our model

In [75]:
print(confusion_matrix(y_test,predict))

[[12  7]
 [ 4 42]]


In [76]:
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       0.75      0.63      0.69        19
           1       0.86      0.91      0.88        46

    accuracy                           0.83        65
   macro avg       0.80      0.77      0.78        65
weighted avg       0.83      0.83      0.83        65

