In [25]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [26]:
# Load the raw candidate results, then work on a copy without the `sl_no`
# column (it is dropped here and never used as a feature below).
result_df = pd.read_csv("candidate_result.csv")
result_df_required = result_df.drop(columns=["sl_no"])
result_df_required.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,overall_result
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Pass
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Pass
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Pass
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Fail
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Pass


In [27]:
# Partition the rows by outcome so the size of each class can be inspected.
is_pass = result_df_required['overall_result'].eq('Pass')
is_fail = result_df_required['overall_result'].eq('Fail')
status_pass = result_df_required.loc[is_pass]
status_fail = result_df_required.loc[is_fail]
# (rows, columns) of the 'Fail' subset.
status_fail.shape

(67, 13)

In [28]:
# Separate the label from the features. The label is selected with a list of
# column names, so it stays a one-column DataFrame rather than a Series.
target_df = result_df_required[['overall_result']]
predictor_df = result_df_required.drop(columns='overall_result')
target_df

Unnamed: 0,overall_result
0,Pass
1,Pass
2,Pass
3,Fail
4,Pass
...,...
210,Pass
211,Pass
212,Pass
213,Pass


In [29]:
# Balance the two classes by random oversampling; the fixed seed makes the
# resampled rows reproducible.
# NOTE(review): the resampling is done before the train/test split that happens
# later in this notebook, so duplicated rows can land on both sides of that
# split and inflate test scores. Consider splitting first and oversampling
# only the training portion.
ros = RandomOverSampler(random_state=23)
x_ros, y_ros = ros.fit_resample(X=predictor_df, y=target_df)
# Confirm both classes now have the same number of rows.
y_ros.value_counts()


overall_result
Fail              148
Pass              148
Name: count, dtype: int64

In [30]:
# Encode the label as integers in a new column, keeping the text column.
# The fitted encoder is kept in `enc` so its class order can be inspected.
enc = LabelEncoder()
enc.fit(y_ros['overall_result'])
y_ros['overall_result_binary'] = enc.transform(y_ros['overall_result'])

In [31]:
# Spot-check the text label next to its integer encoding.
y_ros.head()

Unnamed: 0,overall_result,overall_result_binary
0,Pass,1
1,Pass,1
2,Pass,1
3,Fail,0
4,Pass,1


In [32]:
# Encode the `workex` text column as integers in a new `workex_binary` column.
# The fitted encoder stays in `enc1` so the same mapping can be applied to new
# rows later in the notebook.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; an
# OrdinalEncoder is the usual choice for a feature column.
enc1 = LabelEncoder()
workex_codes = enc1.fit_transform(x_ros['workex'])
x_ros['workex_binary'] = workex_codes
x_ros

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,workex_binary
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.00,Mkt&HR,58.80,0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.50,Mkt&Fin,66.28,1
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.00,Mkt&Fin,57.80,0
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.00,Mkt&HR,59.43,0
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.80,Mkt&Fin,55.50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,F,63.40,Others,67.20,Others,Commerce,60.00,Comm&Mgmt,No,58.06,Mkt&HR,69.28,0
292,M,52.00,Central,57.00,Central,Commerce,50.80,Comm&Mgmt,No,67.00,Mkt&HR,62.79,0
293,M,61.08,Others,50.00,Others,Science,54.00,Sci&Tech,No,71.00,Mkt&Fin,65.69,0
294,M,52.00,Central,63.00,Others,Science,65.00,Sci&Tech,Yes,86.00,Mkt&HR,56.09,1


In [33]:
# Remove the raw text column now that `workex_binary` has been added.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op instead of a KeyError.
x_ros = x_ros.drop(columns='workex', errors='ignore')

In [34]:
# Category order for the two board columns: 'Central' -> 0, 'Others' -> 1.
ordinal_list = ['Central', 'Others']
nominal_columns = ['gender', 'hsc_s', 'degree_t', 'specialisation']
board_columns = ['ssc_b', 'hsc_b']

# One-hot encode the nominal columns (dropping the first level of each),
# ordinal-encode the board columns with the explicit order above, and pass
# every remaining column through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first'), nominal_columns),
        ('oe', OrdinalEncoder(categories=[ordinal_list, ordinal_list]), board_columns),
    ],
    remainder='passthrough',
)

In [35]:
# Fit the column transformer on the oversampled features and encode them in
# one step. The result is a NumPy array (see output).
x_encoded=ct.fit_transform(x_ros)
# Inspect a single encoded row.
x_encoded[1]

array([ 1.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  , 79.33,
       78.33, 77.48, 86.5 , 66.28,  1.  ])

In [36]:
# Inspect another encoded row, taken from near the end of the resampled data.
x_encoded[291]

array([ 0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  1.  ,  1.  , 63.4 ,
       67.2 , 60.  , 58.06, 69.28,  0.  ])

In [37]:
# Display the encoded feature matrix (NumPy abbreviates the repr).
x_encoded

array([[ 1.  ,  1.  ,  0.  , ..., 55.  , 58.8 ,  0.  ],
       [ 1.  ,  0.  ,  1.  , ..., 86.5 , 66.28,  1.  ],
       [ 1.  ,  0.  ,  0.  , ..., 75.  , 57.8 ,  0.  ],
       ...,
       [ 1.  ,  0.  ,  1.  , ..., 71.  , 65.69,  0.  ],
       [ 1.  ,  0.  ,  1.  , ..., 86.  , 56.09,  1.  ],
       [ 1.  ,  0.  ,  1.  , ..., 66.  , 59.43,  0.  ]])

In [38]:
# Display the resampled target frame with both the text and integer labels.
y_ros

Unnamed: 0,overall_result,overall_result_binary
0,Pass,1
1,Pass,1
2,Pass,1
3,Fail,0
4,Pass,1
...,...,...
291,Fail,0
292,Fail,0
293,Fail,0
294,Fail,0


In [39]:
# Hold out 30% of the encoded rows for testing; the fixed seed makes the split
# reproducible. The target is passed as a one-column DataFrame.
# NOTE(review): no `stratify` argument is given, so class proportions in the
# two partitions are not guaranteed to match.
encoded_target = y_ros[['overall_result_binary']]
X_train, X_test, y_train, y_test = train_test_split(
    x_encoded,
    encoded_target,
    test_size=0.30,
    random_state=15,
)
X_train

array([[ 1.  ,  0.  ,  1.  , ..., 60.  , 61.87,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 95.65, 66.94,  0.  ],
       [ 1.  ,  1.  ,  0.  , ..., 68.71, 60.99,  0.  ],
       ...,
       [ 1.  ,  0.  ,  1.  , ..., 80.  , 52.72,  1.  ],
       [ 1.  ,  1.  ,  0.  , ..., 65.  , 60.98,  1.  ],
       [ 1.  ,  0.  ,  1.  , ..., 89.  , 60.22,  0.  ]])

In [40]:
# Inspect the first row of the training partition.
X_train[0]

array([ 1.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  1.  ,  1.  , 67.  ,
       63.  , 64.  , 60.  , 61.87,  0.  ])

In [41]:
# Check how many rows of each class ended up in the test partition.
y_test.value_counts()

overall_result_binary
0                        46
1                        43
Name: count, dtype: int64

In [42]:
# Load new, unlabelled candidate data to run through the fitted encoders.
predictore1_df=pd.read_csv('trial.csv')
predictore1_df

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,workex
0,F,82,Central,75,Central,Commerce,76,Comm&Mgmt,54.96,Mkt&Fin,76,Yes


In [43]:
# Show the class order learned by the target encoder; the position in this
# array is the integer code (index 0 -> 'Fail', index 1 -> 'Pass').
enc.classes_

array(['Fail', 'Pass'], dtype=object)

In [44]:
# Apply the already-fitted `enc1` mapping to the new data (transform only, no
# refit) and replace the text `workex` column with `workex_binary`.
# The guard plus reassignment makes this cell safe to re-run: once `workex`
# has been replaced, a second execution leaves the frame unchanged instead of
# raising KeyError on the missing column.
if 'workex' in predictore1_df.columns:
    predictore1_df = (
        predictore1_df
        # assign() appends the new column last, matching the original layout.
        .assign(workex_binary=enc1.transform(predictore1_df['workex']))
        .drop(columns='workex')
    )
predictore1_df

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,workex_binary
0,F,82,Central,75,Central,Commerce,76,Comm&Mgmt,54.96,Mkt&Fin,76,1


In [45]:
# Encode the new data with the already-fitted column transformer. Only
# `transform` is called here, so the encodings learned earlier are reused.
predictore1_df_encodedp=ct.transform(predictore1_df)
predictore1_df_encodedp

array([[ 0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  , 82.  ,
        75.  , 76.  , 54.96, 76.  ,  1.  ]])

In [46]:
# Display the fitted column transformer.
ct

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [47]:
# Display the oversampled feature frame as it stands after the `workex`
# column was replaced by `workex_binary`.
x_ros

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,workex_binary
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,55.00,Mkt&HR,58.80,0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,86.50,Mkt&Fin,66.28,1
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,75.00,Mkt&Fin,57.80,0
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,66.00,Mkt&HR,59.43,0
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,96.80,Mkt&Fin,55.50,0
...,...,...,...,...,...,...,...,...,...,...,...,...
291,F,63.40,Others,67.20,Others,Commerce,60.00,Comm&Mgmt,58.06,Mkt&HR,69.28,0
292,M,52.00,Central,57.00,Central,Commerce,50.80,Comm&Mgmt,67.00,Mkt&HR,62.79,0
293,M,61.08,Others,50.00,Others,Science,54.00,Sci&Tech,71.00,Mkt&Fin,65.69,0
294,M,52.00,Central,63.00,Others,Science,65.00,Sci&Tech,86.00,Mkt&HR,56.09,1
