### Import the necessary libraries and read the dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is passed, so this also hides deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the two CSV files from the working directory.
# The training frame uses its `id` column as the index; the test frame is read
# with the default index, so `id` remains an ordinary column there.
df_train= pd.read_csv('train.csv', index_col = 'id')
df_test = pd.read_csv('test.csv')

### Data Dictionary

| Variable Name              | Role    | Type       | Demographic         | Description                                                                                                                                                                                                                                                                                                                                                                                    |
|----------------------------|---------|------------|---------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Marital Status             | Feature | Integer    | Marital Status      | 1 – single, 2 – married, 3 – widower, 4 – divorced, 5 – facto union, 6 – legally separated                                                                                                                                                                                                                                                                                                    |
| Application mode           | Feature | Integer    |                     | 1 - 1st phase - general contingent, 2 - Ordinance No. 612/93, 5 - 1st phase - special contingent (Azores Island), 7 - Holders of other higher courses, 10 - Ordinance No. 854-B/99, 15 - International student (bachelor), 16 - 1st phase - special contingent (Madeira Island), 17 - 2nd phase - general contingent, 18 - 3rd phase - general contingent, 26 - Ordinance No. 533-A/99, item b2) (Different Plan), 27 - Ordinance No. 533-A/99, item b3 (Other Institution), 39 - Over 23 years old, 42 - Transfer, 43 - Change of course, 44 - Technological specialization diploma holders, 51 - Change of institution/course, 53 - Short cycle diploma holders, 57 - Change of institution/course (International) |
| Application order          | Feature | Integer    |                     | Application order (between 0 - first choice; and 9 - last choice)                                                                                                                                                                                                                                                                                                                               |
| Course                     | Feature | Integer    |                     | 33 - Biofuel Production Technologies, 171 - Animation and Multimedia Design, 8014 - Social Service (evening attendance), 9003 - Agronomy, 9070 - Communication Design, 9085 - Veterinary Nursing, 9119 - Informatics Engineering, 9130 - Equinculture, 9147 - Management, 9238 - Social Service, 9254 - Tourism, 9500 - Nursing, 9556 - Oral Hygiene, 9670 - Advertising and Marketing Management, 9773 - Journalism and Communication, 9853 - Basic Education, 9991 - Management (evening attendance) |
| Daytime/evening attendance | Feature | Integer    |                     | 1 – daytime, 0 - evening                                                                                                                                                                                                                                                                                                                                                                        |
| Previous qualification     | Feature | Integer    | Education Level     | 1 - Secondary education, 2 - Higher education - bachelor's degree, 3 - Higher education - degree, 4 - Higher education - master's, 5 - Higher education - doctorate, 6 - Frequency of higher education, 9 - 12th year of schooling - not completed, 10 - 11th year of schooling - not completed, 12 - Other - 11th year of schooling, 14 - 10th year of schooling, 15 - 10th year of schooling - not completed, 19 - Basic education 3rd cycle (9th/10th/11th year) or equiv., 38 - Basic education 2nd cycle (6th/7th/8th year) or equiv., 39 - Technological specialization course, 40 - Higher education - degree (1st cycle), 42 - Professional higher technical course, 43 - Higher education - master (2nd cycle) |
| Previous qualification (grade) | Feature | Continuous |                     | Grade of previous qualification (between 0 and 200)                                                                                                                                                                                                                                                                                                                                            |
| Nacionality (column name as spelled in the dataset) | Feature | Integer    | Nationality         | 1 - Portuguese, 2 - German, 6 - Spanish, 11 - Italian, 13 - Dutch, 14 - English, 17 - Lithuanian, 21 - Angolan, 22 - Cape Verdean, 24 - Guinean, 25 - Mozambican, 26 - Santomean, 32 - Turkish, 41 - Brazilian, 62 - Romanian, 100 - Moldova (Republic of), 101 - Mexican, 103 - Ukrainian, 105 - Russian, 108 - Cuban, 109 - Colombian                                                                                       |
| Mother's qualification     | Feature | Integer    | Education Level     | 1 - Secondary Education - 12th Year of Schooling or Eq., 2 - Higher Education - Bachelor's Degree, 3 - Higher Education - Degree, 4 - Higher Education - Master's, 5 - Higher Education - Doctorate, 6 - Frequency of Higher Education, 9 - 12th Year of Schooling - Not Completed, 10 - 11th Year of Schooling - Not Completed, 11 - 7th Year (Old), 12 - Other - 11th Year of Schooling, 14 - 10th Year of Schooling, 18 - General commerce course, 19 - Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv., 22 - Technical-professional course, 26 - 7th year of schooling, 27 - 2nd cycle of the general high school course, 29 - 9th Year of Schooling - Not Completed, 30 - 8th year of schooling, 34 - Unknown, 35 - Can't read or write, 36 - Can read without having a 4th year of schooling, 37 - Basic education 1st cycle (4th/5th year) or equiv., 38 - Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv., 39 - Technological specialization course, 40 - Higher education - degree (1st cycle), 41 - Specialized higher studies course, 42 - Professional higher technical course, 43 - Higher Education - Master (2nd cycle), 44 - Higher Education - Doctorate (3rd cycle) |
| Father's qualification     | Feature | Integer    | Education Level     | 1 - Secondary Education - 12th Year of Schooling or Eq., 2 - Higher Education - Bachelor's Degree, 3 - Higher Education - Degree, 4 - Higher Education - Master's, 5 - Higher Education - Doctorate, 6 - Frequency of Higher Education, 9 - 12th Year of Schooling - Not Completed, 10 - 11th Year of Schooling - Not Completed, 11 - 7th Year (Old), 12 - Other - 11th Year of Schooling, 13 - 2nd year complementary high school course, 14 - 10th Year of Schooling, 18 - General commerce course, 19 - Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv., 20 - Complementary High School Course, 22 - Technical-professional course, 25 - Complementary High School Course - not concluded, 26 - 7th year of schooling, 27 - 2nd cycle of the general high school course, 29 - 9th Year of Schooling - Not Completed, 30 - 8th year of schooling, 31 - General Course of Administration and Commerce, 33 - Supplementary Accounting and Administration, 34 - Unknown, 35 - Can't read or write, 36 - Can read without having a 4th year of schooling, 37 - Basic education 1st cycle (4th/5th year) or equiv., 38 - Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv., 39 - Technological specialization course, 40 - Higher education - degree (1st cycle), 41 - Specialized higher studies course, 42 - Professional higher technical course, 43 - Higher Education - Master (2nd cycle), 44 - Higher Education - Doctorate (3rd cycle) |
| Mother's occupation        | Feature | Integer    | Occupation          | 0 - Student, 1 - Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers, 2 - Specialists in Intellectual and Scientific Activities, 3 - Intermediate Level Technicians and Professions, 4 - Administrative staff, 5 - Personal Services, Security and Safety Workers and Sellers, 6 - Farmers and Skilled Workers in Agriculture, Fisheries and Forestry, 7 - Skilled Workers in Industry, Construction and Craftsmen, 8 - Installation and Machine Operators and Assembly Workers, 9 - Unskilled Workers, 10 - Armed Forces Professions, 90 - Other Situation, 99 - (blank), 122 - Health professionals, 123 - Teachers, 125 - Specialists in information and communication technologies (ICT), 131 - Intermediate level science and engineering technicians and professions, 132 - Technicians and professionals, of intermediate level of health, 134 - Intermediate level technicians from legal, social, sports, cultural and similar services, 141 - Office workers, secretaries in general and data processing operators, 143 - Data, accounting, statistical, financial services and registry-related operators, 144 - Other administrative support staff, 151 - Personal service workers, 152 - Sellers, 153 - Personal care workers and the like, 171 - Skilled construction workers and the like, except electricians, 173 - Skilled workers in printing, precision instrument manufacturing, jewelers, artisans and the like, 175 - Workers in food processing, woodworking, clothing and other industries and crafts, 191 - Cleaning workers, 192 - Unskilled workers in agriculture, animal production, fisheries and forestry, 193 - Unskilled workers in extractive industry, construction, manufacturing and transport, 194 - Meal preparation assistants |
| Father's occupation        | Feature | Integer    | Occupation          | 0 - Student, 1 - Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers, 2 - Specialists in Intellectual and Scientific Activities, 3 - Intermediate Level Technicians and Professions, 4 - Administrative staff, 5 - Personal Services, Security and Safety Workers and Sellers, 6 - Farmers and Skilled Workers in Agriculture, Fisheries and Forestry, 7 - Skilled Workers in Industry, Construction and Craftsmen, 8 - Installation and Machine Operators and Assembly Workers, 9 - Unskilled Workers, 10 - Armed Forces Professions, 90 - Other Situation, 99 - (blank), 122 - Health professionals, 123 - Teachers, 125 - Specialists in information and communication technologies (ICT), 131 - Intermediate level science and engineering technicians and professions, 132 - Technicians and professionals, of intermediate level of health, 134 - Intermediate level technicians from legal, social, sports, cultural and similar services, 141 - Office workers, secretaries in general and data processing operators, 143 - Data, accounting, statistical, financial services and registry-related operators, 144 - Other administrative support staff, 151 - Personal service workers, 152 - Sellers, 153 - Personal care workers and the like, 171 - Skilled construction workers and the like, except electricians, 173 - Skilled workers in printing, precision instrument manufacturing, jewelers, artisans and the like, 175 - Workers in food processing, woodworking, clothing and other industries and crafts, 191 - Cleaning workers, 192 - Unskilled workers in agriculture, animal production, fisheries and forestry, 193 - Unskilled workers in extractive industry, construction, manufacturing and transport, 194 - Meal preparation assistants |
| Displaced                 | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Educational special needs | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Debtor                    | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Tuition fees up to date   | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Gender                    | Feature | Integer    | Gender              | 1 – female, 0 - male                                                                                                                                                                                                                                                                                                                                                                            |
| Scholarship holder        | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Age at enrollment         | Feature | Integer    | Age                 | Age at enrollment (years)                                                                                                                                                                                                                                                                                                                                                                       |
| International             | Feature | Integer    |                     | 1 – yes, 0 - no                                                                                                                                                                                                                                                                                                                                                                                 |
| Curricular units 1st sem (credited) | Feature | Integer |                     | Number of curricular units credited in the 1st semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 1st sem (enrolled) | Feature | Integer |                     | Number of curricular units enrolled in the 1st semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 1st sem (evaluations) | Feature | Integer |                     | Number of curricular units evaluated in the 1st semester                                                                                                                                                                                                                                                                                                                                          |
| Curricular units 1st sem (approved) | Feature | Integer |                     | Number of curricular units approved in the 1st semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 1st sem (grade) | Feature | Continuous |                     | Grade of curricular units in the 1st semester                                                                                                                                                                                                                                                                                                                                                   |
| Curricular units 1st sem (without evaluations) | Feature | Integer |                     | Number of curricular units in the 1st semester without evaluations                                                                                                                                                                                                                                                                                                                                |
| Curricular units 2nd sem (credited) | Feature | Integer |                     | Number of curricular units credited in the 2nd semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 2nd sem (enrolled) | Feature | Integer |                     | Number of curricular units enrolled in the 2nd semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 2nd sem (evaluations) | Feature | Integer |                     | Number of curricular units evaluated in the 2nd semester                                                                                                                                                                                                                                                                                                                                          |
| Curricular units 2nd sem (approved) | Feature | Integer |                     | Number of curricular units approved in the 2nd semester                                                                                                                                                                                                                                                                                                                                           |
| Curricular units 2nd sem (grade) | Feature | Continuous |                     | Grade of curricular units in the 2nd semester                                                                                                                                                                                                                                                                                                                                                   |
| Curricular units 2nd sem (without evaluations) | Feature | Integer |                     | Number of curricular units in the 2nd semester without evaluations                                                                                                                                                                                                                                                                                                                                |
| Unemployment rate         | Feature | Continuous |                     | Unemployment rate (%) in the student's home country                                                                                                                                                                                                                                                                                                                                              |
| Inflation rate            | Feature | Continuous |                     | Inflation rate (%) in the student's home country                                                                                                                                                                                                                                                                                                                                                 |
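| GDP                       | Feature | Continuous |                     | GDP indicator. The values in this dataset range from -4.06 to 3.51 (negatives included), so it is not an absolute amount in billions; presumably a growth rate (%) — confirm against the dataset documentation                                                                                                                                                                                  |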
| Target                    | Target  | Integer    |                     | 1 – dropout, 0 - enrolled                                                                                                                                                                                                                                                                                                                                                                         |


In [3]:
# Preview the first rows of the training set (head() defaults to 5 rows).
df_train.head()

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
# Preview the first rows of the test set; `id` shows up as a regular column here
# because test.csv was read without index_col.
df_test.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


### Understanding the dataset

In [7]:
# (number of rows, number of columns) of the training set; the `id` index is not counted as a column.
df_train.shape

(76518, 37)

In [13]:
# Show every column when a DataFrame is displayed (no "..." column truncation).
# This is a global pandas option, so it stays in effect for all later cells.
pd.set_option('display.max_columns', None)

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_train.describe()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0
mean,1.111934,16.054419,1.64441,9001.286377,0.915314,3.65876,132.378766,1.2266,19.837633,23.425076,8.583196,8.882172,125.363971,0.569265,0.003738,0.071382,0.893646,0.315821,0.247393,22.278653,0.006626,0.188871,5.891516,7.352362,4.17852,9.995862,0.05796,0.137053,5.933414,7.234468,4.007201,9.626085,0.062443,11.52034,1.228218,-0.080921
std,0.441669,16.682337,1.229645,1803.438531,0.278416,8.623774,10.995328,3.392183,15.399456,14.921164,17.471591,16.80394,12.562328,0.495182,0.061023,0.257463,0.308292,0.464845,0.4315,6.889241,0.08113,1.175296,1.671776,3.508292,2.687995,5.264224,0.40849,0.93383,1.627182,3.50304,2.772956,5.546035,0.462107,2.653375,1.398816,2.251382
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9119.0,1.0,1.0,125.0,1.0,1.0,4.0,4.0,5.0,118.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,0.0,0.0,5.0,6.0,2.0,10.666667,0.0,0.0,5.0,6.0,1.0,10.0,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9254.0,1.0,1.0,133.1,1.0,19.0,19.0,7.0,7.0,124.6,1.0,0.0,0.0,1.0,0.0,0.0,19.0,0.0,0.0,6.0,7.0,5.0,12.166667,0.0,0.0,6.0,7.0,5.0,12.142857,0.0,11.1,1.4,0.32
75%,1.0,39.0,2.0,9670.0,1.0,1.0,140.0,1.0,37.0,37.0,9.0,9.0,132.0,1.0,0.0,0.0,1.0,1.0,0.0,23.0,0.0,0.0,6.0,9.0,6.0,13.314286,0.0,0.0,6.0,9.0,6.0,13.244048,0.0,12.7,2.6,1.79
max,6.0,53.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,194.0,195.0,190.0,1.0,1.0,1.0,1.0,1.0,1.0,70.0,1.0,20.0,26.0,45.0,26.0,18.875,12.0,19.0,23.0,33.0,20.0,18.0,12.0,16.2,3.7,3.51


In [12]:
# Frequency of each class in the target column.
df_train['Target'].value_counts()
# The dataset is imbalanced: Graduate is the largest class and Enrolled the smallest.

Target
Graduate    36282
Dropout     25296
Enrolled    14940
Name: count, dtype: int64

In [15]:
# Count missing (NaN/None) entries per column.
df_train.isna().sum()

# No null values present

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [16]:
# Count rows whose column values exactly repeat an earlier row.
# The `id` index is not part of the comparison.
df_train.duplicated().sum()

# No duplicate rows present

0