# Data Preprocessing Steps

In [1]:
import pandas as pd

### Data Collection

In [2]:
# Load the placement records from the CSV file into a DataFrame.
dataset = pd.read_csv("Placement.csv")

In [3]:
# Display the loaded frame as the cell output (long frames are shown truncated).
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


### Check the number of Missing Values

In [10]:
# Count the missing (NaN) entries in every column.
# `isna()` and `isnull()` are aliases, so dataset.isnull().sum() gives the same result.
dataset.isna().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

### Fill all the missing values in a particular column with 0
# Each strategy in this section builds its OWN frame and leaves `dataset`
# untouched. Mutating `dataset` in place would let the first strategy remove
# every NaN, turning the later strategies (and the imputation cells further
# down, which expect the missing salaries to still be present) into no-ops.
# It also avoids `dataset["salary"].fillna(..., inplace=True)`: calling an
# in-place method on a selected column is chained assignment, which recent
# pandas versions warn about and which does not update `dataset` once
# Copy-on-Write is enabled.
dataset_salary_zero = dataset.assign(salary=dataset["salary"].fillna(0))

### Delete all the rows that contain Missing values
# dropna() returns a new frame without the rows that hold any NaN.
dataset_rows_dropped = dataset.dropna()

dataset_rows_dropped

### Replace the Missing values with central tendency based on the Problem Statement

# Series.mean() skips NaN by default, so the mean is computed from the known
# salaries only and then used to fill the gaps.
dataset_salary_mean = dataset.assign(
    salary=dataset["salary"].fillna(dataset["salary"].mean())
)

dataset_salary_mean

### Using quanQual Separation Function

In [6]:
from Univariate_Analysis import Univariate

# quanQual() returns two collections of column names, unpacked here as
# `quan` and `qual` (presumably quantitative vs. qualitative columns --
# confirm against the Univariate_Analysis module).
quan, qual = Univariate.quanQual(dataset)

# Show the first collection as the cell output.
quan


['sl_no', 'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']

### Imputation of Missing Values using scikit-learn's SimpleImputer  -->  It replaces the missing values in every column passed to it, irrespective of the column

In [7]:

# SimpleImputer parameters
# missing_values => int, float, str, np.nan, None or pandas.NA, default=np.nan
# strategy => 'mean' (default), 'median'                      --> numeric data only
#             'most_frequent', 'constant' (with fill_value)   --> categorical and numeric data

import numpy as np
from sklearn.impute import SimpleImputer

# Replace every NaN found in the selected columns with the constant 0.
Imputation = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

# fit_transform() is equivalent to fit() followed by transform(): it fits the
# imputer on the selected columns and returns them with the missing values
# replaced ("imputed"). By default the result is a NumPy array, not a DataFrame.
df = Imputation.fit_transform(dataset[quan])

# Wrap the array back into a DataFrame. Reusing dataset's index keeps every
# imputed row aligned with its source row even when that index is not a plain
# 0..n-1 range (for example after rows have been dropped or filtered).
dataset_withoutNULL = pd.DataFrame(df, columns=quan, index=dataset.index)

In [8]:
# Display the imputer object created in the previous cell as the cell output.
Imputation

### All the Missing values in salary column are replaced with 'constant = 0' based on the Problem Statement

In [11]:
# Display the frame rebuilt from the imputer's output (the `quan` columns only).
dataset_withoutNULL

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
0,1.0,67.00,91.00,58.00,55.0,58.80,270000.0
1,2.0,79.33,78.33,77.48,86.5,66.28,200000.0
2,3.0,65.00,68.00,64.00,75.0,57.80,250000.0
3,4.0,56.00,52.00,52.00,66.0,59.43,0.0
4,5.0,85.80,73.60,73.30,96.8,55.50,425000.0
...,...,...,...,...,...,...,...
210,211.0,80.60,82.00,77.60,91.0,74.49,400000.0
211,212.0,58.00,60.00,72.00,74.0,53.62,275000.0
212,213.0,67.00,67.00,73.00,59.0,69.72,295000.0
213,214.0,74.00,66.00,58.00,70.0,60.23,204000.0
