In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from scipy.stats import kurtosis,skew

from matplotlib.colors import LinearSegmentedColormap

In [2]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Targeted filter: hide only scikit-learn's ConvergenceWarning.
simplefilter("ignore", category=ConvergenceWarning)

import warnings
# Blanket filter: with no category given this ignores every warning, so the
# targeted ConvergenceWarning filter above is effectively subsumed by it.
# NOTE(review): this also hides pandas/NumPy deprecation and chained-assignment
# warnings for the rest of the notebook — consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
# Load the campus-placement records from the CSV sitting next to the notebook
# (relative path), then echo the frame so its preview renders under the cell.
data = pd.read_csv(filepath_or_buffer='Campus Placement.csv')
data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


### Column Description

-   gender -> candidate's gender (`M` / `F`)
-   ssc_p -> 10th class percentage
-   ssc_b -> 10th class board
-   hsc_p -> 12th class percentage
-   hsc_b -> 12th class board
-   hsc_s -> 12th class branch
-   degree_p -> undergraduate degree percentage
-   degree_t -> degree branch
-   workex -> work experience
-   etest_p -> Entrance test percentage
-   specialisation -> MBA branch
-   mba_p -> MBA percentage
-   status -> placement status
-   salary -> quoted salary

In [4]:
# Shorten `specialisation` to `mba_t` so the MBA branch column is named like
# `degree_t`; the frame is modified in place.
data.rename({'specialisation': 'mba_t'}, axis='columns', inplace=True)

In [5]:
# Count rows that repeat an earlier row across all columns (0 means no duplicates).
data.duplicated().sum()

0

In [6]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gender    215 non-null    object 
 1   ssc_p     215 non-null    float64
 2   ssc_b     215 non-null    object 
 3   hsc_p     215 non-null    float64
 4   hsc_b     215 non-null    object 
 5   hsc_s     215 non-null    object 
 6   degree_p  215 non-null    float64
 7   degree_t  215 non-null    object 
 8   workex    215 non-null    object 
 9   etest_p   215 non-null    float64
 10  mba_t     215 non-null    object 
 11  mba_p     215 non-null    float64
 12  status    215 non-null    object 
 13  salary    148 non-null    float64
dtypes: float64(6), object(8)
memory usage: 23.6+ KB


1. `salary` is the only column with null values: 148 of 215 entries are non-null, so 67 are missing.

In [7]:
# `salary` is the only column with missing values (148 non-null of 215 in the
# info() summary), so fill that column explicitly with the placeholder 0.
# A blanket `data.fillna(0)` would also turn any future gap in a categorical
# or percentage column into a silent 0 instead of leaving it visible as NaN.
data['salary'] = data['salary'].fillna(0)

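The missing salaries are filled with zeros, which is technically acceptable and avoids NaNs. Note that a salary of 0 is therefore a placeholder for "no salary recorded", not a real value.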

In [8]:
# Sanity check: number of salary entries that are still missing (expected 0).
data['salary'].isnull().sum()

0

In [9]:
# Re-display the frame to confirm the previously missing salaries now show as 0.0.
data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,mba_t,mba_p,status,salary
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [10]:
# Spell out the ampersand in the degree and MBA branch labels
# ('&' -> 'And'); values not listed in a mapping are left untouched.
data['degree_t'] = data['degree_t'].replace(
    {'Comm&Mgmt': 'CommAndMgmt', 'Sci&Tech': 'SciAndTech'}
)
data['mba_t'] = data['mba_t'].replace(
    {'Mkt&Fin': 'MktAndFin', 'Mkt&HR': 'MktAndHR'}
)
data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,mba_t,mba_p,status,salary
0,M,67.00,Others,91.00,Others,Commerce,58.00,SciAndTech,No,55.0,MktAndHR,58.80,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,SciAndTech,Yes,86.5,MktAndFin,66.28,Placed,200000.0
2,M,65.00,Central,68.00,Central,Arts,64.00,CommAndMgmt,No,75.0,MktAndFin,57.80,Placed,250000.0
3,M,56.00,Central,52.00,Central,Science,52.00,SciAndTech,No,66.0,MktAndHR,59.43,Not Placed,0.0
4,M,85.80,Central,73.60,Central,Commerce,73.30,CommAndMgmt,No,96.8,MktAndFin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,CommAndMgmt,No,91.0,MktAndFin,74.49,Placed,400000.0
211,M,58.00,Others,60.00,Others,Science,72.00,SciAndTech,No,74.0,MktAndFin,53.62,Placed,275000.0
212,M,67.00,Others,67.00,Others,Commerce,73.00,CommAndMgmt,Yes,59.0,MktAndFin,69.72,Placed,295000.0
213,F,74.00,Others,66.00,Others,Commerce,58.00,CommAndMgmt,No,70.0,MktAndHR,60.23,Placed,204000.0


In [11]:
# Keep only rows with a non-zero salary; 0 is the placeholder written for
# missing salaries earlier in the notebook, so this drops those rows.
# `.copy()` makes `dataset` an independent frame, so later column assignments
# on it are unambiguous and never treated as writes to a slice of `data`
# (warnings are globally suppressed above, so such a problem would be silent).
dataset = data[data['salary'] != 0].copy()