In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the raw export. The real column names are stored in the first data row,
# so that row is promoted to the header and then dropped from the data.
raw_data = pd.read_csv('placement.csv')
new_header = raw_data.iloc[0]
raw_data = raw_data.iloc[1:]
raw_data.columns = new_header
# Preview the first rows and the resulting column index.
print(raw_data.head(), raw_data.columns, sep='\n')

0                                      Branch    Degree Aptitude English  \
1     Applied Electronics and Instrumentation  B. Tech.     69 %    87 %   
2                      Mechanical Engineering  B. Tech.     76 %    80 %   
3  Electronics and Communications Engineering  B. Tech.     71 %    73 %   
4     Applied Electronics and Instrumentation  B. Tech.     71 %    93 %   
5  Electronics and Communications Engineering  B. Tech.     64 %    73 %   

0 Quantitative Analytical Domain Computer Fundamental Coding Placement Status  
1         53 %       67 %   30 %                 20 %   60 %           Placed  
2         73 %       73 %   55 %                 40 %   00 %              NaN  
3         73 %       67 %   55 %                 47 %   20 %              NaN  
4         40 %       80 %   50 %                 33 %   20 %              NaN  
5         47 %       73 %   55 %                 53 %   20 %              NaN  
Index(['Branch', 'Degree', 'Aptitude', 'English', 'Quantitative

In [7]:
# Report the dataset dimensions (rows, columns).
n_rows, n_cols = raw_data.shape
print(f'No. of rows: {n_rows}')
print(f'No. of columns: {n_cols}')

No. of rows: 12864
No. of columns: 10


In [8]:
# Count the missing values per column in one pass, then print one line per column.
for col, null_count in raw_data.isna().sum().items():
  print(f'{col} : {null_count}')

Branch : 0
Degree : 0
Aptitude : 0
English : 0
Quantitative : 0
Analytical : 0
Domain : 0
Computer Fundamental : 0
Coding : 0
Placement Status : 12097


In [9]:
# A missing placement status is recorded as 'Not Placed'.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form acts on an intermediate object,
# emits a warning on recent pandas and leaves raw_data unchanged under
# Copy-on-Write.
raw_data = raw_data.fillna({'Placement Status': 'Not Placed'})

In [10]:
# Numerical encoding for the target variable.
# Show the class balance before encoding.
print(raw_data['Placement Status'].value_counts())
# One-hot encode the target; drop_first=True keeps only the
# 'Placement Status_Placed' indicator column.
raw_data_1 = pd.get_dummies(raw_data, columns=['Placement Status'], drop_first=True)
# The encoded counts should mirror the label counts printed above.
print(raw_data_1['Placement Status_Placed'].value_counts())
print()
# DataFrame.info() writes its report directly and returns None, so it is called
# on its own; wrapping it in print() appended a stray "None" line to the output.
raw_data_1.info()

Not Placed    12097
Placed          767
Name: Placement Status, dtype: int64
0    12097
1      767
Name: Placement Status_Placed, dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12864 entries, 1 to 12864
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Branch                   12864 non-null  object
 1   Degree                   12864 non-null  object
 2   Aptitude                 12864 non-null  object
 3   English                  12864 non-null  object
 4   Quantitative             12864 non-null  object
 5   Analytical               12864 non-null  object
 6   Domain                   12864 non-null  object
 7   Computer Fundamental     12864 non-null  object
 8   Coding                   12864 non-null  object
 9   Placement Status_Placed  12864 non-null  uint8 
dtypes: object(9), uint8(1)
memory usage: 917.2+ KB
None


In [11]:
# Class balance of the encoded target column.
placed_counts = raw_data_1["Placement Status_Placed"].value_counts()
print(placed_counts)

0    12097
1      767
Name: Placement Status_Placed, dtype: int64


In [12]:
# Cross-tabulate Branch (rows) against the encoded placement flag (columns).
# Alternative long-format view, kept for reference:
# print(raw_data_1.groupby('Branch')['Placement Status_Placed'].value_counts())
branch_vs_placed = pd.crosstab(
    index=raw_data_1['Branch'],
    columns=raw_data_1['Placement Status_Placed'],
)
print(branch_vs_placed)

Placement Status_Placed                              0    1
Branch                                                     
Agricultural Engineering                            14    1
Applied Electronics and Instrumentation            165   22
Automobile Engineering                              15    2
Civil Engineering                                  942   10
Computer Application                                 1    0
Computer Science And Information Technology        204    0
Computer Science Engineering                      2279  263
Electrical & Communications Engineering              1    0
Electrical Engineering                            2247   90
Electrical and Electronics Engineering             676   26
Electronic Instrumentation & Control Engineering    32    0
Electronics & TeleCommunication Engineering        110    0
Electronics and Communications Engineering        2506  207
Electronics and Instrumentation Engineering        235    5
Information Technology                  

In [13]:
# Disabled spot check: when uncommented, prints a random sample of 10 rows whose
# Branch is 'Information Technology' and whose 'Placement Status_Placed' flag is 0.
# No random_state is passed, so each run draws different rows.
# print(raw_data_1.loc[(raw_data_1['Branch'] == 'Information Technology') & (raw_data_1['Placement Status_Placed'] == 0)].sample(n=10))

                       Branch    Degree Aptitude English Quantitative  \
11690  Information Technology  B. Tech.     58 %    40 %         53 %   
7743   Information Technology  B. Tech.     31 %    27 %         47 %   
11293  Information Technology  B. Tech.     47 %    67 %         40 %   
2824   Information Technology  B. Tech.     47 %    47 %         60 %   
9935   Information Technology  B. Tech.     42 %    60 %         00 %   
11271  Information Technology  B. Tech.     71 %    67 %         60 %   
2631   Information Technology  B. Tech.     67 %    73 %         60 %   
9281   Information Technology  B. Tech.     44 %    33 %         33 %   
4606   Information Technology  B. Tech.     53 %    40 %         73 %   
9922   Information Technology  B. Tech.     47 %    87 %         07 %   

      Analytical Domain Computer Fundamental Coding  Placement Status_Placed  
11690       80 %   20 %                 20 %   00 %                        0  
7743        20 %   25 %               

In [14]:
# Lookup table used to filter the data: branch name -> integer quota.
# Branches that are absent from this table are skipped by the filtering step.
helper = dict([
    ('Agricultural Engineering', 2),
    ('Applied Electronics and Instrumentation', 30),
    ('Automobile Engineering', 3),
    ('Civil Engineering', 15),
    ('Computer Science Engineering', 300),
    ('Electrical Engineering', 110),
    ('Electrical and Electronics Engineering', 35),
    ('Electronics and Communications Engineering', 230),
    ('Electronics and Instrumentation Engineering', 7),
    ('Information Technology', 65),
    ('Instrumentation and Control Engineering', 5),
    ('MCA', 8),
    ('Mechanical Engineering', 95),
])


In [30]:
# Build a more balanced dataset: for every branch listed in `helper`, keep all
# placed rows and a random sample of helper[branch] not-placed rows.

# Non-inplace rename so the statement reads as a plain reassignment.
raw_data_1 = raw_data_1.rename(columns={'Placement Status_Placed': 'placed'})

# Pieces are collected in a list and concatenated once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it every time.
parts = []
for branch in raw_data_1['Branch'].unique():
  if branch not in helper:
    continue
  in_branch = raw_data_1['Branch'] == branch
  # All rows of this branch that are placed.
  parts.append(raw_data_1.loc[in_branch & (raw_data_1['placed'] == 1)])
  # Fixed-size random sample of the not-placed rows; the seed keeps the
  # balanced dataset identical across re-runs.
  parts.append(
      raw_data_1.loc[in_branch & (raw_data_1['placed'] == 0)]
      .sample(n=helper[branch], random_state=42)
  )

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns when no branch matched.
new_data = pd.concat(parts) if parts else raw_data_1.iloc[0:0]

print(pd.crosstab(new_data['Branch'], new_data['placed']))
    

placed                                         0    1
Branch                                               
Agricultural Engineering                       2    1
Applied Electronics and Instrumentation       30   22
Automobile Engineering                         3    2
Civil Engineering                             15   10
Computer Science Engineering                 300  263
Electrical Engineering                       110   90
Electrical and Electronics Engineering        35   26
Electronics and Communications Engineering   230  207
Electronics and Instrumentation Engineering    7    5
Information Technology                        65   53
Instrumentation and Control Engineering        5    4
MCA                                            8    6
Mechanical Engineering                        95   78


In [34]:
# Shuffle the rows (frac=1 samples every row once, in random order) and write
# the result to disk without the index column.
new_data_shuffled = new_data.sample(frac=1)
new_data_shuffled.to_csv(path_or_buf='balanced.csv', index=False)

In [35]:
# Offer the exported CSV as a browser download when running inside Google Colab.
# The google.colab package only exists in the Colab runtime; elsewhere the
# download step is skipped instead of aborting the notebook run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of 'balanced.csv'.")
else:
    files.download("balanced.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
# Sanity check: read the exported file back and confirm the per-branch class
# counts and the overall shape.
test = pd.read_csv('balanced.csv')
print(pd.crosstab(index=test['Branch'], columns=test['placed']))
print(test.shape)

placed                                         0    1
Branch                                               
Agricultural Engineering                       2    1
Applied Electronics and Instrumentation       30   22
Automobile Engineering                         3    2
Civil Engineering                             15   10
Computer Science Engineering                 300  263
Electrical Engineering                       110   90
Electrical and Electronics Engineering        35   26
Electronics and Communications Engineering   230  207
Electronics and Instrumentation Engineering    7    5
Information Technology                        65   53
Instrumentation and Control Engineering        5    4
MCA                                            8    6
Mechanical Engineering                        95   78
(1672, 10)
