## Standard imports :

In [1]:
import pandas as pd
import numpy as np
import os

## Data Import :

In [2]:
data_path = os.path.join("..", "data/student_enrollment.csv")

In [3]:
df = pd.read_csv(data_path, encoding="latin1")

In [4]:
df.head()

Unnamed: 0,College_id,College_name,survey_year,faculty_name,department_name,levell,programme_id,programme,discipline_group_id,discipline_group,...,other_minority_general_females,other_minority_sc_total,other_minority_sc_females,other_minority_st_total,other_minority_st_females,other_minority_obc_total,other_minority_obc_females,other_minority_total_persons,other_minority_total_females,other_minority_remarks_id
0,15134,"SMT. BHUDEVI MAHAVIDYALAYA SHASTRI PURAM, AGRA",2015,,,Under Graduate,30,B.Sc.-Bachelor of Science,171,Science,...,,,,,,,,0.0,0.0,1.0
1,6133,Techno India 130,2015,,,Under Graduate,36,B.Tech.-Bachelor of Technology,26,Other Engineering & Technology,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,47253,"Shri D N Institute of Computer Applications, A...",2015,,,Under Graduate,46,B.C.A.-Bachelor of Computer Applications,183,Computer Application,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,6332,Techno India 309,2015,,BHM,Under Graduate,54,B.H.M.-Bachelor of Hotel Management,57,Hotel Management,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,6065,Sigma Institute of Physiotherapy,2015,,,Under Graduate,70,B.P.T.-Bachelor of Physiotherapy,164,Physiotherapy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [5]:
len(df.columns)

58

In [33]:
len(df)

375105

## Data Preprocessing :

### Data cleaning  
     
   The techniques used in data cleaning are:
   * Replacing missing values with 0
   * Converting to same case to prevent duplicates with different case

In [24]:
df.isna().sum()

College_name                       0
levell                             0
programme                          0
discipline_group                   0
discipline                         0
type                               0
year                               0
total_general_total                0
total_general_females              0
total_sc_total                     0
total_sc_females                   0
total_st_total                     0
total_st_females                   0
total_obc_total                    0
total_obc_females                  0
total_total_persons                0
total_total_females                0
pwd_general_total                  0
pwd_general_females                0
pwd_sc_total                       0
pwd_sc_females                     0
pwd_st_total                       0
pwd_st_females                     0
pwd_obc_total                      0
pwd_obc_females                    0
pwd_total_persons                  0
pwd_total_females                  0
m

In [23]:
df.fillna(0,inplace=True)

In [9]:
df["discipline"] = df["discipline"].str.lower()

In [32]:
df = df[df.total_total_persons != 0]
df = df[df.College_name != 0]
df = df[df.levell != 0]
df = df[df.programme != 0]
df = df[df.discipline != 0]
df = df[df.discipline_group != 0]

### Data Reduction

Data reduction is brought about by using attribute selection.  
The unecessary columns are removed from the full set of attributes

In [10]:
df.columns

Index(['College_id', 'College_name', 'survey_year', 'faculty_name',
       'department_name', 'levell', 'programme_id', 'programme',
       'discipline_group_id', 'discipline_group', 'discipline', 'type', 'year',
       'id', 'total_general_total', 'total_general_females', 'total_sc_total',
       'total_sc_females', 'total_st_total', 'total_st_females',
       'total_obc_total', 'total_obc_females', 'total_total_persons',
       'total_total_females', 'total_remarks_id', 'pwd_general_total',
       'pwd_general_females', 'pwd_sc_total', 'pwd_sc_females', 'pwd_st_total',
       'pwd_st_females', 'pwd_obc_total', 'pwd_obc_females',
       'pwd_total_persons', 'pwd_total_females', 'pwd_remarks_id',
       'muslim_minority_general_total', 'muslim_minority_general_females',
       'muslim_minority_sc_total', 'muslim_minority_sc_females',
       'muslim_minority_st_total', 'muslim_minority_st_females',
       'muslim_minority_obc_total', 'muslim_minority_obc_females',
       'muslim_minorit

In [11]:
redundant_cols = ["College_id", "survey_year", "faculty_name", "department_name"]

In [12]:
for col in df.columns:
    split_col = col.split("_")
    if "remarks" in split_col or "id" in split_col:
        print(col)
        redundant_cols.append(col)

College_id
programme_id
discipline_group_id
id
total_remarks_id
pwd_remarks_id
muslim_minority_remarks_id
other_minority_remarks_id


In [13]:
df.drop(columns=redundant_cols, axis=1, inplace=True)

### Data transformation :

The data transformation techniques used are:
* Discretization
* Normalization

### Normalization :

In [14]:
all_cols = df.columns.tolist()

In [15]:
cols_to_remove = ["College_name", "levell", "programme", "discipline", "discipline_group",
                  "type", "year", "total_total_persons"
             ]

In [16]:
cols_to_normalize = [col for col in all_cols if col not in cols_to_remove]

In [17]:
cols_to_normalize

['total_general_total',
 'total_general_females',
 'total_sc_total',
 'total_sc_females',
 'total_st_total',
 'total_st_females',
 'total_obc_total',
 'total_obc_females',
 'total_total_females',
 'pwd_general_total',
 'pwd_general_females',
 'pwd_sc_total',
 'pwd_sc_females',
 'pwd_st_total',
 'pwd_st_females',
 'pwd_obc_total',
 'pwd_obc_females',
 'pwd_total_persons',
 'pwd_total_females',
 'muslim_minority_general_total',
 'muslim_minority_general_females',
 'muslim_minority_sc_total',
 'muslim_minority_sc_females',
 'muslim_minority_st_total',
 'muslim_minority_st_females',
 'muslim_minority_obc_total',
 'muslim_minority_obc_females',
 'muslim_minority_total_persons',
 'muslim_minority_total_females',
 'other_minority_general_total',
 'other_minority_general_females',
 'other_minority_sc_total',
 'other_minority_sc_females',
 'other_minority_st_total',
 'other_minority_st_females',
 'other_minority_obc_total',
 'other_minority_obc_females',
 'other_minority_total_persons',
 'other

In [20]:
count = 0
id_list = []
for idx, i in enumerate(df["total_total_persons"]):
    if i == 0:
        count += 1
        id_list.append(idx)
print(count)

64001


In [21]:
id_list

[6,
 13,
 20,
 21,
 24,
 38,
 40,
 42,
 60,
 74,
 78,
 79,
 88,
 91,
 97,
 98,
 112,
 114,
 123,
 151,
 155,
 158,
 161,
 165,
 173,
 176,
 182,
 185,
 189,
 200,
 205,
 213,
 217,
 226,
 227,
 232,
 273,
 278,
 285,
 293,
 294,
 295,
 300,
 304,
 306,
 310,
 313,
 318,
 322,
 337,
 342,
 343,
 345,
 360,
 362,
 376,
 384,
 387,
 392,
 397,
 423,
 428,
 435,
 446,
 451,
 453,
 462,
 464,
 486,
 487,
 491,
 496,
 500,
 502,
 508,
 520,
 531,
 544,
 550,
 559,
 573,
 598,
 609,
 637,
 660,
 678,
 684,
 695,
 699,
 700,
 709,
 721,
 740,
 745,
 747,
 752,
 768,
 775,
 785,
 790,
 806,
 814,
 816,
 818,
 832,
 833,
 836,
 847,
 853,
 868,
 871,
 872,
 873,
 878,
 884,
 895,
 904,
 914,
 915,
 927,
 934,
 939,
 946,
 950,
 956,
 969,
 970,
 987,
 992,
 1011,
 1014,
 1015,
 1017,
 1028,
 1030,
 1036,
 1046,
 1057,
 1076,
 1079,
 1090,
 1097,
 1105,
 1109,
 1117,
 1118,
 1130,
 1134,
 1158,
 1161,
 1167,
 1184,
 1201,
 1205,
 1223,
 1236,
 1245,
 1248,
 1261,
 1262,
 1263,
 1264,
 1271,
 1274

In [16]:
totals = df["total_total_persons"]
for col in cols_to_normalize:
    df[col] = df[col].divide(totals, axis="index")

In [17]:
df["discipline_group"].unique()

array(['Science', 'Other Engineering & Technology',
       'Computer Application', 'Hotel Management', 'Physiotherapy',
       'Civil Engineering', 'Information Technology',
       'Electrical Engineering', 'Public Health', 'Anesthesiology',
       'Mechanical Engineering', 'Physiology', 'Cultural Studies', 'Arts',
       'Commerce', 'Homeopathy', 'Pharmacy', 'Hindi', 'Economics',
       'Geography', 'Other Science', 'History', 'Nursing', 'Architecture',
       'Electronics Engineering', 'Computer Science',
       'Business Administration', 'Zoology', 'Law',
       'Computer Engineering', 'Area Studies', 'Education',
       'Physical Education', 'Sanskrit', 'English', 'Oriental Learning',
       'Agriculture Engineering', 'Sociology', 'Political Science',
       'Chemistry', 'Business Management', 'Bio-Technology',
       'Visual Arts', 'Other Indian Languages', 'Chemical Engineering',
       'Mathematics', 'Dentistry', 'Medical Science', 'Bengali',
       'Fashion Technology', 'Women 

### Aggregation :

In [18]:
df.columns

Index(['College_name', 'levell', 'programme', 'discipline_group', 'discipline',
       'type', 'year', 'total_general_total', 'total_general_females',
       'total_sc_total', 'total_sc_females', 'total_st_total',
       'total_st_females', 'total_obc_total', 'total_obc_females',
       'total_total_persons', 'total_total_females', 'pwd_general_total',
       'pwd_general_females', 'pwd_sc_total', 'pwd_sc_females', 'pwd_st_total',
       'pwd_st_females', 'pwd_obc_total', 'pwd_obc_females',
       'pwd_total_persons', 'pwd_total_females',
       'muslim_minority_general_total', 'muslim_minority_general_females',
       'muslim_minority_sc_total', 'muslim_minority_sc_females',
       'muslim_minority_st_total', 'muslim_minority_st_females',
       'muslim_minority_obc_total', 'muslim_minority_obc_females',
       'muslim_minority_total_persons', 'muslim_minority_total_females',
       'other_minority_general_total', 'other_minority_general_females',
       'other_minority_sc_total', 'ot

#### grouping SC, ST and OBC into one feature :

In [19]:
agg = []
for col in df.columns:
    split_col = col.split("_")
    if "sc" in split_col:
        l = [col]
        for st_col in df.columns:
            if st_col.split("_")[0] == split_col[0] and st_col.split("_")[::-1][0] == split_col[::-1][0]:
                if "st" in st_col.split("_") or "obc" in st_col.split("_"):
                    l.append(st_col)
        agg.append(l)

In [20]:
agg

[['total_sc_total', 'total_st_total', 'total_obc_total'],
 ['total_sc_females', 'total_st_females', 'total_obc_females'],
 ['pwd_sc_total', 'pwd_st_total', 'pwd_obc_total'],
 ['pwd_sc_females', 'pwd_st_females', 'pwd_obc_females'],
 ['muslim_minority_sc_total',
  'muslim_minority_st_total',
  'muslim_minority_obc_total'],
 ['muslim_minority_sc_females',
  'muslim_minority_st_females',
  'muslim_minority_obc_females'],
 ['other_minority_sc_total',
  'other_minority_st_total',
  'other_minority_obc_total'],
 ['other_minority_sc_females',
  'other_minority_st_females',
  'other_minority_obc_females']]

In [21]:
for triple in agg:
    df[triple[0]] = df[triple[0]] + df[triple[1]] + df[triple[2]]
    df.drop(triple[1], axis=1, inplace=True)
    df.drop(triple[2], axis=1, inplace=True)

#### Renaming the columns from 'sc' to 'backward_castes' :

In [22]:
cols = df.columns.tolist()
for idx, col in enumerate(cols):
    if "sc" in col.split("_"):
        cols[idx] = col.replace("_sc_", "_backward_castes_")

df.columns = cols

In [23]:
df.columns

Index(['College_name', 'levell', 'programme', 'discipline_group', 'discipline',
       'type', 'year', 'total_general_total', 'total_general_females',
       'total_backward_castes_total', 'total_backward_castes_females',
       'total_total_persons', 'total_total_females', 'pwd_general_total',
       'pwd_general_females', 'pwd_backward_castes_total',
       'pwd_backward_castes_females', 'pwd_total_persons', 'pwd_total_females',
       'muslim_minority_general_total', 'muslim_minority_general_females',
       'muslim_minority_backward_castes_total',
       'muslim_minority_backward_castes_females',
       'muslim_minority_total_persons', 'muslim_minority_total_females',
       'other_minority_general_total', 'other_minority_general_females',
       'other_minority_backward_castes_total',
       'other_minority_backward_castes_females',
       'other_minority_total_persons', 'other_minority_total_females'],
      dtype='object')

### Discretization :

In [34]:
def one_hot_enc(column_name, df):
    d = pd.get_dummies(df[column_name])
    df = pd.concat([d, df], axis=1)
    df.drop(column_name, axis=1, inplace=True)
    return df

In [35]:
df = one_hot_enc("levell", df)

### Saving the preprocessed data :

In [24]:
save_path = os.path.join("../data/preprocessed.csv")
df.to_csv(save_path, index=False)