In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go

%matplotlib inline

In [2]:
# Load the campus placement dataset from its Kaggle input mount into `df`.
csv_path = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(csv_path)

In [3]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"The data frame has {n_rows} rows and {n_cols} columns")

The data frame has 215 rows and 15 columns


# Head & Tail

In [4]:
# Preview the first five records of the frame.
df.head(n=5)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [5]:
# Preview the last five records of the frame.
df.tail(n=5)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
210,211,M,80.6,Others,82.0,Others,Commerce,77.6,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.0,Others,60.0,Others,Science,72.0,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.0,Others,67.0,Others,Commerce,73.0,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.0,Others,66.0,Others,Commerce,58.0,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0
214,215,M,62.0,Central,58.0,Others,Science,53.0,Comm&Mgmt,No,89.0,Mkt&HR,60.22,Not Placed,


# Data Cleaning

In [6]:
# 'sl_no' is a running serial number in the head/tail previews (1, 2, 3, ... 215),
# so it is dropped before the analysis.
# Rebinding `df` with errors='ignore' keeps this cell safe to re-run: the
# original in-place drop raised KeyError the second time it was executed,
# because the column was already gone.
df = df.drop(columns='sl_no', errors='ignore')

In [7]:
# Inspect column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          215 non-null    object 
 1   ssc_p           215 non-null    float64
 2   ssc_b           215 non-null    object 
 3   hsc_p           215 non-null    float64
 4   hsc_b           215 non-null    object 
 5   hsc_s           215 non-null    object 
 6   degree_p        215 non-null    float64
 7   degree_t        215 non-null    object 
 8   workex          215 non-null    object 
 9   etest_p         215 non-null    float64
 10  specialisation  215 non-null    object 
 11  mba_p           215 non-null    float64
 12  status          215 non-null    object 
 13  salary          148 non-null    float64
dtypes: float64(6), object(8)
memory usage: 23.6+ KB


In [8]:
# Convert the text (object) columns to the `category` dtype to reduce memory usage.
categorical_columns = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s',
    'degree_t', 'workex', 'specialisation', 'status',
]

for column in categorical_columns:
    df[column] = df[column].astype('category')

In [9]:
# Re-check dtypes and memory usage after the category conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   gender          215 non-null    category
 1   ssc_p           215 non-null    float64 
 2   ssc_b           215 non-null    category
 3   hsc_p           215 non-null    float64 
 4   hsc_b           215 non-null    category
 5   hsc_s           215 non-null    category
 6   degree_p        215 non-null    float64 
 7   degree_t        215 non-null    category
 8   workex          215 non-null    category
 9   etest_p         215 non-null    float64 
 10  specialisation  215 non-null    category
 11  mba_p           215 non-null    float64 
 12  status          215 non-null    category
 13  salary          148 non-null    float64 
dtypes: category(8), float64(6)
memory usage: 12.6 KB


In [10]:
# Summarise status/salary for the students who were not placed,
# to see how many of them have a salary recorded.
not_placed_mask = df['status'] == 'Not Placed'
df.loc[not_placed_mask, ['status', 'salary']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67 entries, 3 to 214
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   status  67 non-null     category
 1   salary  0 non-null      float64 
dtypes: category(1), float64(1)
memory usage: 1.2 KB


That means every student who was not placed has a NaN salary. Let's check whether any placed student is wrongly marked as NaN.

In [11]:
# Summarise status/salary for the placed students,
# to confirm none of them is missing a salary.
placed_mask = df['status'] == 'Placed'
df.loc[placed_mask, ['status', 'salary']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 213
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   status  148 non-null    category
 1   salary  148 non-null    float64 
dtypes: category(1), float64(1)
memory usage: 2.6 KB


In [12]:
# Replace the missing salaries with 0. The checks above show that the NaN values
# belong to the 'Not Placed' students, so 0 here means "no salary".
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form triggers a FutureWarning in
# pandas 2.x and silently leaves `df` unchanged under Copy-on-Write.
df['salary'] = df['salary'].fillna(0)

In [13]:
# Confirm that 'salary' no longer has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   gender          215 non-null    category
 1   ssc_p           215 non-null    float64 
 2   ssc_b           215 non-null    category
 3   hsc_p           215 non-null    float64 
 4   hsc_b           215 non-null    category
 5   hsc_s           215 non-null    category
 6   degree_p        215 non-null    float64 
 7   degree_t        215 non-null    category
 8   workex          215 non-null    category
 9   etest_p         215 non-null    float64 
 10  specialisation  215 non-null    category
 11  mba_p           215 non-null    float64 
 12  status          215 non-null    category
 13  salary          215 non-null    float64 
dtypes: category(8), float64(6)
memory usage: 12.6 KB


So the data looks pretty clean now.
Let's move on to the EDA.

# EDA and Visualisations

Let's start with the categorical data and then proceed to the numerical data.

In [14]:
# Number of students for each (gender, placement status) pair.
# size() counts rows directly; the previous count()['salary'] skipped NaN and was
# only correct because 'salary' had been NaN-filled in an earlier cell.
# observed=False is spelled out because both keys are categorical: it keeps the
# existing behaviour (zero-count combinations are retained) and avoids the
# pandas FutureWarning about the changing default.
data_gen = (
    df.groupby(['gender', 'status'], observed=False)
    .size()
    .reset_index(name='count')
)

# Grouped bars: one bar per placement status within each gender.
fig = px.bar(data_gen, x='gender', y='count', color='status', barmode='group')
fig.update_layout(width=800, title='Gender wise Placements')
fig.show()

In [15]:
# Salary distribution by gender, restricted to placed students.
placed_students = df[df['status'] == 'Placed']
fig = px.box(
    placed_students,
    x='gender',
    y='salary',
    color='gender',
    width=500,
    title='Gender vs Salary',
)
fig.show()

In [16]:
# Salary distribution by SSC board, restricted to placed students.
placed_students = df[df['status'] == 'Placed']
fig = px.box(
    placed_students,
    x='ssc_b',
    y='salary',
    color='ssc_b',
    width=500,
    title='SSC Board vs Salary',
)
fig.show()

In [17]:
# Salary distribution by higher-secondary stream ('hsc_s'), restricted to
# placed students.
# The title previously read 'SSC Board vs Salary' (copied from the cell above),
# which mislabelled this chart; it now names the variable actually plotted.
fig = px.box(
    df[df['status'] == 'Placed'],
    x='hsc_s',
    y='salary',
    color='hsc_s',
    width=500,
    title='HSC Stream vs Salary',
)
fig.show()

In [18]:
# Number of students for each (higher-secondary stream, placement status) pair.
# size() counts rows directly; the previous count()['salary'] skipped NaN and was
# only correct because 'salary' had been NaN-filled in an earlier cell.
# observed=False is spelled out because both keys are categorical: it keeps the
# existing behaviour (zero-count combinations are retained) and avoids the
# pandas FutureWarning about the changing default.
data_gen = (
    df.groupby(['hsc_s', 'status'], observed=False)
    .size()
    .reset_index(name='count')
)

# Grouped bars: one bar per placement status within each stream.
fig = px.bar(data_gen, x='hsc_s', y='count', color='status', barmode='group')
fig.update_layout(width=800, title='Stream wise Placements')
fig.show()

In [19]:
# Number of students for each (work experience, placement status) pair.
# size() counts rows directly; the previous count()['salary'] skipped NaN and was
# only correct because 'salary' had been NaN-filled in an earlier cell.
# observed=False is spelled out because both keys are categorical: it keeps the
# existing behaviour (zero-count combinations are retained) and avoids the
# pandas FutureWarning about the changing default.
data_gen = (
    df.groupby(['workex', 'status'], observed=False)
    .size()
    .reset_index(name='count')
)

# Grouped bars: one bar per placement status within each work-experience group.
# The chart title also fixes the 'Expirence' misspelling.
fig = px.bar(data_gen, x='workex', y='count', color='status', barmode='group')
fig.update_layout(width=800, title='Work Experience dependency on Placements')
fig.show()

In [20]:
# Salary distribution by MBA specialisation, restricted to placed students.
placed_students = df[df['status'] == 'Placed']
fig = px.box(
    placed_students,
    x='specialisation',
    y='salary',
    color='specialisation',
    width=500,
    title='Specialisation vs Salary',
)
fig.show()