In [1]:
import pandas as pd
from StudentPlacementMetadata import StudentPlacementMetadata
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the student placement records; the path is relative, so the CSV must
# sit in the notebook's working directory.
dataset = pd.read_csv("Student-Placement.csv")

In [3]:
# Remove the column limit so wide frames are rendered with every column visible.
pd.set_option('display.max_columns', None)

In [4]:
# Preview the freshly loaded frame (pandas elides the middle rows when rendering).
dataset

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96,7.91,7.69,7,No,8,10,2,No
...,...,...,...,...,...,...,...,...,...,...
9995,CLG0021,119,8.41,8.29,4,No,1,8,0,Yes
9996,CLG0098,70,9.25,9.34,7,No,0,7,2,No
9997,CLG0066,89,6.08,6.25,3,Yes,3,9,5,No
9998,CLG0045,107,8.77,8.92,3,No,7,5,1,No


# Data Preprocessing

In [6]:
# Print dtypes and non-null counts to check for missing values and to see
# which columns are stored as object rather than numeric.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   College_ID              10000 non-null  object 
 1   IQ                      10000 non-null  int64  
 2   Prev_Sem_Result         10000 non-null  float64
 3   CGPA                    10000 non-null  float64
 4   Academic_Performance    10000 non-null  int64  
 5   Internship_Experience   10000 non-null  object 
 6   Extra_Curricular_Score  10000 non-null  int64  
 7   Communication_Skills    10000 non-null  int64  
 8   Projects_Completed      10000 non-null  int64  
 9   Placement               10000 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 781.4+ KB


In [7]:
# Split the column names into two lists via the project helper.
# NOTE(review): judging by the results, `quant` holds the numeric columns and
# `qual` the object-dtype ones -- confirm against StudentPlacementMetadata.quanQual.
quant, qual = StudentPlacementMetadata.quanQual(dataset)

In [8]:
# Column names returned as the first list by quanQual.
quant

['IQ',
 'Prev_Sem_Result',
 'CGPA',
 'Academic_Performance',
 'Extra_Curricular_Score',
 'Communication_Skills',
 'Projects_Completed']

In [9]:
# Column names returned as the second list by quanQual.
qual

['College_ID', 'Internship_Experience', 'Placement']

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Extra_Curricular_Score,Communication_Skills,Projects_Completed
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,99.4718,7.535673,7.532379,5.5464,4.9709,5.5618,2.5134
std,15.053101,1.447519,1.470141,2.873477,3.160103,2.900866,1.715959
min,41.0,5.0,4.54,1.0,0.0,1.0,0.0
25%,89.0,6.29,6.29,3.0,2.0,3.0,1.0
50%,99.0,7.56,7.55,6.0,5.0,6.0,3.0
75%,110.0,8.79,8.77,8.0,8.0,8.0,4.0
max,158.0,10.0,10.46,10.0,10.0,10.0,5.0


In [11]:
# Build the quartile / IQR-fence table for every quantitative column.
#
# describe() is evaluated once and each row of the table is derived with
# vectorised Series arithmetic. The previous loop re-ran describe() on the
# whole frame four times per column and filled an object-dtype frame cell by
# cell; this version yields the same labels and values as a float table.
described = dataset.describe()[quant]
q1 = described.loc["25%"]
q2 = described.loc["50%"]
q3 = described.loc["75%"]
iqr = q3 - q1
fence = 1.5 * iqr  # distance beyond the quartiles at which a value counts as an outlier

univariate = pd.DataFrame(
    {
        "Q1:25": q1,
        "Q2:50": q2,
        "Q3:75": q3,
        "Q4:100": described.loc["max"],
        "IQR": iqr,
        "1.5Rule": fence,
        "Lesser": q1 - fence,   # lower fence: anything below is a low outlier
        "Greater": q3 + fence,  # upper fence: anything above is a high outlier
        "Min": described.loc["min"],
        "Max": described.loc["max"],
    }
).T  # transpose so rows are statistics and columns are the features in `quant`

In [12]:
# Inspect the quartile / fence table; compare the Min and Max rows against
# the Lesser and Greater rows to spot columns with outliers.
univariate

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Extra_Curricular_Score,Communication_Skills,Projects_Completed
Q1:25,89.0,6.29,6.29,3.0,2.0,3.0,1.0
Q2:50,99.0,7.56,7.55,6.0,5.0,6.0,3.0
Q3:75,110.0,8.79,8.77,8.0,8.0,8.0,4.0
Q4:100,158.0,10.0,10.46,10.0,10.0,10.0,5.0
IQR,21.0,2.5,2.48,5.0,6.0,5.0,3.0
1.5Rule,31.5,3.75,3.72,7.5,9.0,7.5,4.5
Lesser,57.5,2.54,2.57,-4.5,-7.0,-4.5,-3.5
Greater,141.5,12.54,12.49,15.5,17.0,15.5,8.5
Min,41.0,5.0,4.54,1.0,0.0,1.0,0.0
Max,158.0,10.0,10.46,10.0,10.0,10.0,5.0


In [13]:
# Collect the columns that contain outliers according to the fence table:
#   lesser  -> the column minimum lies below the lower fence ("Lesser" row)
#   greater -> the column maximum lies above the upper fence ("Greater" row)
lesser = [
    name
    for name in univariate.columns
    if univariate.loc["Min", name] < univariate.loc["Lesser", name]
]
greater = [
    name
    for name in univariate.columns
    if univariate.loc["Max", name] > univariate.loc["Greater", name]
]

In [14]:
# Columns whose minimum falls below the lower fence.
lesser

['IQ']

In [15]:
# Columns whose maximum exceeds the upper fence.
greater

['IQ']

In [16]:
# Cap outliers at the IQR fences instead of dropping the rows.
# Each affected column is cast to float first, because a fence can be
# fractional (57.5 etc.) and would not fit in an integer column.
for column in lesser:
    lower_fence = float(univariate[column]["Lesser"])
    # Raise every value below the lower fence up to the fence itself.
    dataset[column] = dataset[column].astype(float).clip(lower=lower_fence)

for column in greater:
    upper_fence = float(univariate[column]["Greater"])
    # Pull every value above the upper fence down to the fence itself.
    dataset[column] = dataset[column].astype(float).clip(upper=upper_fence)



In [17]:
# Inspect the frame after capping; columns that were capped are now float.
dataset

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107.0,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97.0,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109.0,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122.0,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96.0,7.91,7.69,7,No,8,10,2,No
...,...,...,...,...,...,...,...,...,...,...
9995,CLG0021,119.0,8.41,8.29,4,No,1,8,0,Yes
9996,CLG0098,70.0,9.25,9.34,7,No,0,7,2,No
9997,CLG0066,89.0,6.08,6.25,3,Yes,3,9,5,No
9998,CLG0045,107.0,8.77,8.92,3,No,7,5,1,No


In [18]:
# Rebuild the quartile / IQR-fence table on the capped data so the outlier
# check can be repeated.
#
# describe() is evaluated once and each row of the table is derived with
# vectorised Series arithmetic. The previous loop re-ran describe() on the
# whole frame four times per column and filled an object-dtype frame cell by
# cell; this version yields the same labels and values as a float table.
described = dataset.describe()[quant]
q1 = described.loc["25%"]
q2 = described.loc["50%"]
q3 = described.loc["75%"]
iqr = q3 - q1
fence = 1.5 * iqr  # distance beyond the quartiles at which a value counts as an outlier

univariate = pd.DataFrame(
    {
        "Q1:25": q1,
        "Q2:50": q2,
        "Q3:75": q3,
        "Q4:100": described.loc["max"],
        "IQR": iqr,
        "1.5Rule": fence,
        "Lesser": q1 - fence,   # lower fence: anything below is a low outlier
        "Greater": q3 + fence,  # upper fence: anything above is a high outlier
        "Min": described.loc["min"],
        "Max": described.loc["max"],
    }
).T  # transpose so rows are statistics and columns are the features in `quant`

In [19]:
# Inspect the recomputed table: Min and Max should now sit on or inside the fences.
univariate

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Extra_Curricular_Score,Communication_Skills,Projects_Completed
Q1:25,89.0,6.29,6.29,3.0,2.0,3.0,1.0
Q2:50,99.0,7.56,7.55,6.0,5.0,6.0,3.0
Q3:75,110.0,8.79,8.77,8.0,8.0,8.0,4.0
Q4:100,141.5,10.0,10.46,10.0,10.0,10.0,5.0
IQR,21.0,2.5,2.48,5.0,6.0,5.0,3.0
1.5Rule,31.5,3.75,3.72,7.5,9.0,7.5,4.5
Lesser,57.5,2.54,2.57,-4.5,-7.0,-4.5,-3.5
Greater,141.5,12.54,12.49,15.5,17.0,15.5,8.5
Min,57.5,5.0,4.54,1.0,0.0,1.0,0.0
Max,141.5,10.0,10.46,10.0,10.0,10.0,5.0


In [20]:
# Repeat the outlier scan on the recomputed fence table:
#   lesser  -> the column minimum lies below the lower fence ("Lesser" row)
#   greater -> the column maximum lies above the upper fence ("Greater" row)
lesser = [
    name
    for name in univariate.columns
    if univariate.loc["Min", name] < univariate.loc["Lesser", name]
]
greater = [
    name
    for name in univariate.columns
    if univariate.loc["Max", name] > univariate.loc["Greater", name]
]

In [21]:
# Should be empty once low outliers have been capped at the lower fence.
lesser

[]

In [22]:
# Should be empty once high outliers have been capped at the upper fence.
greater

[]

In [23]:
# Replace the College_ID strings with integer codes, overwriting the column.
# The fitted encoder stays available as `le` so the codes can be mapped back
# to the original labels later.
le = LabelEncoder().fit(dataset['College_ID'])
dataset['College_ID'] = le.transform(dataset['College_ID'])

In [24]:
# Confirm that College_ID now holds integer codes instead of strings.
dataset

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,29,107.0,6.61,6.28,8,No,8,8,4,No
1,60,97.0,5.52,5.37,8,No,7,8,0,No
2,35,109.0,5.36,5.83,9,No,3,1,1,No
3,54,122.0,5.47,5.75,6,Yes,1,6,1,No
4,3,96.0,7.91,7.69,7,No,8,10,2,No
...,...,...,...,...,...,...,...,...,...,...
9995,20,119.0,8.41,8.29,4,No,1,8,0,Yes
9996,97,70.0,9.25,9.34,7,No,0,7,2,No
9997,65,89.0,6.08,6.25,3,Yes,3,9,5,No
9998,44,107.0,8.77,8.92,3,No,7,5,1,No
