## Unsupervised Machine Learning: Final Project

### Import Libraries

In [161]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture


### Load Data and Explore

In [162]:
# Load the students dropout/success dataset from Google Drive.
# The path below assumes Drive is already mounted at /content/drive. On a fresh
# Colab session, mount it first:
#   from google.colab import drive
#   drive.mount('/content/drive', force_remount=True)
csv_path = '/content/drive/My Drive/Machine_Learning/Coursera/IBM_Machine_Learning/Unsupervised_Machine_Learning/Final_Project/students_dropout_and_success.csv'
students_data = pd.read_csv(csv_path)

# Preview the first five rows
students_data.head()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [163]:
# Summarize the frame: number of rows, column names, non-null counts, and dtypes
students_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital Status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [164]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
students_data.describe().T  # transposed so that each feature is a row and each statistic a column

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital Status,4424.0,1.178571,0.605747,1.0,1.0,1.0,1.0,6.0
Application mode,4424.0,18.669078,17.484682,1.0,1.0,17.0,39.0,57.0
Application order,4424.0,1.727848,1.313793,0.0,1.0,1.0,2.0,9.0
Course,4424.0,8856.642631,2063.566416,33.0,9085.0,9238.0,9556.0,9991.0
Daytime/evening attendance,4424.0,0.890823,0.311897,0.0,1.0,1.0,1.0,1.0
Previous qualification,4424.0,4.577758,10.216592,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),4424.0,132.613314,13.188332,95.0,125.0,133.1,140.0,190.0
Nacionality,4424.0,1.873192,6.914514,1.0,1.0,1.0,1.0,109.0
Mother's qualification,4424.0,19.561935,15.603186,1.0,2.0,19.0,37.0,44.0
Father's qualification,4424.0,22.275316,15.343108,1.0,3.0,19.0,37.0,44.0


### Exploratory Data Analysis and Feature Engineering

In [165]:
# Check the class proportions of the Target column. Target is not used as a clustering feature
# (it is left out of feature_columns below); it only serves as a reference to see whether
# 3 clusters give similar proportions.
students_data.Target.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Target,Unnamed: 1_level_1
Graduate,0.499322
Dropout,0.321203
Enrolled,0.179476


In [166]:
# Every non-object (numeric) column is a clustering feature; this leaves out the string Target column
feature_columns = list(students_data.select_dtypes(exclude=['object']).columns)

# Confirm the selected features and their dtypes
students_data.loc[:, feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 36 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital Status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [167]:
# Pairwise correlations between all feature columns
corr_mat = students_data[feature_columns].corr()

# Zero the diagonal (each feature's correlation with itself) so that it does not
# dominate the per-feature maximum computed in the next step
for diag_pos in range(corr_mat.shape[0]):
    corr_mat.iat[diag_pos, diag_pos] = 0.0

corr_mat



Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
Marital Status,0.0,0.264006,-0.125854,0.046365,-0.274939,0.062529,-0.022406,-0.008843,0.193163,0.130353,...,0.034711,0.062831,0.039026,0.022784,-0.043739,-0.071506,0.020426,-0.020338,0.008761,-0.027003
Application mode,0.264006,0.0,-0.286357,0.065385,-0.304092,0.422411,-0.03902,-0.000661,0.118974,0.083276,...,0.045828,0.238445,0.130046,0.167872,-0.071526,-0.115424,0.047983,0.08908,-0.016375,-0.022743
Application order,-0.125854,-0.286357,0.0,0.059507,0.158657,-0.184315,-0.064484,-0.022416,-0.064956,-0.050288,...,-0.031699,-0.125815,0.028878,-0.055089,0.071793,0.055517,-0.015757,-0.098419,-0.011133,0.030201
Course,0.046365,0.065385,0.059507,0.0,-0.043151,0.006654,-0.081013,-0.033923,0.054543,0.050724,...,0.034514,-0.089817,0.401539,0.278797,0.198032,0.348728,0.030816,0.007153,0.01771,-0.020265
Daytime/evening attendance,-0.274939,-0.304092,0.158657,-0.043151,0.0,-0.071871,0.052597,0.01853,-0.204767,-0.139894,...,0.04563,-0.111953,0.000371,0.01461,0.034022,0.050493,-0.004229,0.061974,-0.024043,0.022929
Previous qualification,0.062529,0.422411,-0.184315,0.006654,-0.071871,0.0,0.104072,-0.029214,-0.01319,-0.006614,...,0.002887,0.143031,0.056179,0.11485,-0.008632,0.000942,0.005102,0.111958,-0.063736,0.064069
Previous qualification (grade),-0.022406,-0.03902,-0.064484,-0.081013,0.052597,0.104072,0.0,0.054088,-0.06067,-0.035234,...,-0.003926,-0.018489,-0.031649,-0.061355,0.050263,0.053239,-0.019015,0.045222,0.01871,-0.05262
Nacionality,-0.008843,-0.000661,-0.022416,-0.033923,0.01853,-0.029214,0.054088,0.0,-0.049946,-0.085282,...,0.009145,-0.007278,-0.020113,-0.025721,-0.01788,-0.008497,-0.014041,-0.000651,-0.008922,0.034478
Mother's qualification,0.193163,0.118974,-0.064956,0.054543,-0.204767,-0.01319,-0.06067,-0.049946,0.0,0.53514,...,0.003183,0.042771,0.03515,0.021033,-0.014858,-0.031175,0.021305,-0.114351,0.059441,-0.083657
Father's qualification,0.130353,0.083276,-0.050288,0.050724,-0.139894,-0.006614,-0.035234,-0.085282,0.53514,0.0,...,-0.017333,0.042666,0.02438,0.009514,0.005285,-0.008083,-0.00743,-0.077905,0.057633,-0.07161


In [168]:
# For each feature, the largest absolute correlation it has with any other feature
# (the diagonal of corr_mat was zeroed above), sorted from weakest to strongest
corr_mat.abs().max().sort_values()

Unnamed: 0,0
Educational special needs,0.046131
Inflation rate,0.112295
Scholarship holder,0.202704
Gender,0.224266
Application order,0.332362
Unemployment rate,0.335178
GDP,0.335178
Displaced,0.362032
Course,0.401539
Debtor,0.408454


Check skew values in anticipation of transformations.

In [169]:
# Skewness of every feature, largest first, keeping only the features whose
# skew exceeds 0.75 (the candidates for a log transform)
skew_columns = (
    students_data[feature_columns]
    .skew()
    .sort_values(ascending=False)
    .loc[lambda skew: skew > 0.75]
)
skew_columns

Unnamed: 0,0
Nacionality,10.703998
Educational special needs,9.154976
Curricular units 1st sem (without evaluations),8.207403
Curricular units 2nd sem (without evaluations),7.267701
International,6.10483
Father's occupation,5.395173
Mother's occupation,5.339227
Curricular units 2nd sem (credited),4.63482
Marital Status,4.399764
Curricular units 1st sem (credited),4.169049


In [170]:
# Apply log(1 + x) to every column flagged as skewed, all columns at once.
# NOTE: this overwrites students_data in place, so re-running this cell alone applies
# the transform a second time; re-run from the data-loading cell instead.
# NOTE(review): some of the flagged columns (e.g. Nacionality, Marital Status) look like
# integer category codes rather than quantities -- confirm a log transform is meaningful for them.
skewed_names = list(skew_columns.index)
students_data[skewed_names] = np.log1p(students_data[skewed_names])



In [171]:
# Standardize every feature to zero mean and unit variance:
# learn the per-column statistics first, then overwrite the features with their scaled values
scaler = StandardScaler().fit(students_data[feature_columns])
students_data[feature_columns] = scaler.transform(students_data[feature_columns])

# Show the scaled feature matrix
students_data[feature_columns]

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,-0.323463,-0.095470,2.322256,-4.209520,0.350082,-0.395901,-0.804841,-0.153037,-0.036018,-0.669778,...,-0.245125,-0.339689,-4.325593,-2.042630,-1.471527,-1.963489,-0.238476,-0.287638,0.124386,0.765761
1,-0.323463,-0.209869,-0.606025,0.192580,0.350082,-0.395901,2.076819,-0.153037,-1.189759,-1.256427,...,-0.245125,-0.339689,0.080493,-0.522682,0.518904,0.659562,-0.238476,0.876222,-1.105222,0.347199
2,-0.323463,-1.010660,2.322256,0.103404,0.350082,-0.395901,-0.804841,-0.153037,1.117723,0.959802,...,-0.245125,-0.339689,0.080493,-2.042630,-1.471527,-1.963489,-0.238476,-0.287638,0.124386,0.765761
3,-0.323463,-0.095470,0.474716,0.444115,0.350082,-0.395901,-0.804841,-0.153037,1.181819,0.959802,...,-0.245125,-0.339689,0.080493,0.490616,0.187165,0.416450,-0.238476,-0.813253,-1.466871,-1.375511
4,1.787906,1.162916,-0.606025,-0.408389,-2.856470,-0.395901,-2.473171,-0.153037,1.117723,1.024985,...,-0.245125,-0.339689,0.080493,-0.522682,0.518904,0.531608,-0.238476,0.876222,-1.105222,0.347199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,-0.323463,-1.010660,2.733134,0.444115,0.350082,-0.395901,-0.577342,-0.153037,-1.189759,-1.386793,...,-0.245125,-0.339689,0.080493,-0.016033,0.187165,0.467631,-0.238476,1.476924,1.137005,-1.789667
4420,-0.323463,-1.010660,0.474716,0.444115,0.350082,-0.395901,-0.956508,9.046312,-1.189759,-1.386793,...,-0.245125,-0.339689,0.080493,-0.522682,-0.808050,0.147747,-0.238476,-0.175007,-0.454253,0.889126
4421,-0.323463,-1.010660,-0.606025,0.311805,0.350082,-0.395901,1.621820,-0.153037,1.117723,0.959802,...,-0.245125,-0.339689,0.649540,0.237291,-1.139788,0.627573,-0.238476,0.876222,-1.105222,0.347199
4422,-0.323463,-1.010660,-0.606025,0.140722,0.350082,-0.395901,3.593483,-0.153037,1.117723,0.959802,...,-0.245125,-0.339689,-0.268547,-0.522682,0.187165,0.339678,-0.238476,-0.813253,-1.466871,-1.375511


In [172]:
students_data[feature_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital Status,4424.0,-3.244341e-16,1.000113,-0.323463,-0.323463,-0.323463,-0.323463,6.20002
Application mode,4424.0,-1.477621e-16,1.000113,-1.01066,-1.01066,-0.09547,1.162916,2.192505
Application order,4424.0,-1.991575e-16,1.000113,-2.453564,-0.606025,-0.606025,0.474716,3.683829
Course,4424.0,2.288706e-16,1.000113,-4.276402,0.110674,0.184826,0.338945,0.549769
Daytime/evening attendance,4424.0,7.066881e-17,1.000113,-2.85647,0.350082,0.350082,0.350082,0.350082
Previous qualification,4424.0,-8.753295000000001e-17,1.000113,-0.395901,-0.395901,-0.395901,-0.395901,3.120153
Previous qualification (grade),4424.0,-3.589654e-16,1.000113,-2.852337,-0.577342,0.036907,0.560156,4.351815
Nacionality,4424.0,-2.778569e-16,1.000113,-0.153037,-0.153037,-0.153037,-0.153037,9.132138
Mother's qualification,4424.0,-5.781993000000001e-17,1.000113,-1.189759,-1.125662,-0.036018,1.117723,1.5664
Father's qualification,4424.0,0.0,1.000113,-1.386793,-1.256427,-0.213496,0.959802,1.416085


So far we have:


*   Excluded the `Target` column from the feature set (it stays in the DataFrame, but only as a reference and is not used for clustering)
*   Converted all the feature columns to float columns
*   Log-transformed (`log1p`) the columns with high skew (> 0.75)
*   Rescaled all the feature columns with `StandardScaler`

Now we are ready to perform clustering. The intention is to form 3 clusters from this dataset.





### Model Clustering

### 1. KMeans

In [173]:
# Fit k-means with 3 clusters on the scaled features (seed fixed for reproducibility).
# NOTE: n_init is left at the library default, which differs between scikit-learn versions.
km = KMeans(n_clusters=3, random_state=42).fit(students_data[feature_columns])

# Store, for every row, the index of the cluster it is assigned to
students_data['kmeans_cluster'] = km.predict(students_data[feature_columns])



### 2. GMM

In [174]:
# Fit a 3-component Gaussian mixture on the scaled features (seed fixed for reproducibility)
gmm = GaussianMixture(n_components=3, random_state=42).fit(students_data[feature_columns])

# Store, for every row, the index of the component it is assigned to
students_data['gmm_cluster'] = gmm.predict(students_data[feature_columns])

### 3. Agglomerative Clustering

In [175]:
# Agglomerative (hierarchical) clustering into 3 clusters on the scaled features
ag = AgglomerativeClustering(n_clusters=3)
ag = ag.fit(students_data[feature_columns])

# Reuse the labels computed by the fit above. The original called fit_predict() here,
# which re-ran the entire clustering a second time only to return these same labels.
students_data['agglomerative_cluster'] = ag.labels_

In [176]:
# Show the frame with the three cluster-label columns appended next to Target
students_data

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,kmeans_cluster,gmm_cluster,agglomerative_cluster
0,-0.323463,-0.095470,2.322256,-4.209520,0.350082,-0.395901,-0.804841,-0.153037,-0.036018,-0.669778,...,-1.471527,-1.963489,-0.238476,-0.287638,0.124386,0.765761,Dropout,1,1,1
1,-0.323463,-0.209869,-0.606025,0.192580,0.350082,-0.395901,2.076819,-0.153037,-1.189759,-1.256427,...,0.518904,0.659562,-0.238476,0.876222,-1.105222,0.347199,Graduate,0,2,0
2,-0.323463,-1.010660,2.322256,0.103404,0.350082,-0.395901,-0.804841,-0.153037,1.117723,0.959802,...,-1.471527,-1.963489,-0.238476,-0.287638,0.124386,0.765761,Dropout,2,2,1
3,-0.323463,-0.095470,0.474716,0.444115,0.350082,-0.395901,-0.804841,-0.153037,1.181819,0.959802,...,0.187165,0.416450,-0.238476,-0.813253,-1.466871,-1.375511,Graduate,0,2,0
4,1.787906,1.162916,-0.606025,-0.408389,-2.856470,-0.395901,-2.473171,-0.153037,1.117723,1.024985,...,0.518904,0.531608,-0.238476,0.876222,-1.105222,0.347199,Graduate,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,-0.323463,-1.010660,2.733134,0.444115,0.350082,-0.395901,-0.577342,-0.153037,-1.189759,-1.386793,...,0.187165,0.467631,-0.238476,1.476924,1.137005,-1.789667,Graduate,0,2,0
4420,-0.323463,-1.010660,0.474716,0.444115,0.350082,-0.395901,-0.956508,9.046312,-1.189759,-1.386793,...,-0.808050,0.147747,-0.238476,-0.175007,-0.454253,0.889126,Dropout,0,0,0
4421,-0.323463,-1.010660,-0.606025,0.311805,0.350082,-0.395901,1.621820,-0.153037,1.117723,0.959802,...,-1.139788,0.627573,-0.238476,0.876222,-1.105222,0.347199,Dropout,0,2,0
4422,-0.323463,-1.010660,-0.606025,0.140722,0.350082,-0.395901,3.593483,-0.153037,1.117723,0.959802,...,0.187165,0.339678,-0.238476,-0.813253,-1.466871,-1.375511,Graduate,0,2,0


### Model Result

In [182]:
# Number of rows in each Target class (the reference the cluster sizes are compared against)
students_data.Target.value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Graduate,2209
Dropout,1421
Enrolled,794


In [181]:
# Number of rows assigned to each k-means cluster, largest cluster first
students_data.kmeans_cluster.value_counts()

Unnamed: 0_level_0,count
kmeans_cluster,Unnamed: 1_level_1
0,3503
2,741
1,180


In [183]:
# Number of rows assigned to each Gaussian-mixture component, largest first
students_data.gmm_cluster.value_counts()

Unnamed: 0_level_0,count
gmm_cluster,Unnamed: 1_level_1
2,3496
0,748
1,180


In [184]:
# Number of rows assigned to each agglomerative cluster, largest cluster first
students_data.agglomerative_cluster.value_counts()

Unnamed: 0_level_0,count
agglomerative_cluster,Unnamed: 1_level_1
0,3390
1,719
2,315


In [190]:
# Build the comparison table from the data itself instead of hand-copying the counts
# from the cells above, so it stays correct if the data, seed, or library version changes.
# value_counts() orders each column's group sizes from largest to smallest; resetting the
# index lines the columns up by rank (row 0 = largest group of each column).
# NOTE: this compares group sizes only -- cluster ids are arbitrary, so a shared row does
# not mean that a cluster corresponds to the Target class in the same row.
size_sources = {
    'Target(Reference)': 'Target',
    'kmeans_cluster': 'kmeans_cluster',
    'gmm_cluster': 'gmm_cluster',
    'agglomerative_cluster': 'agglomerative_cluster',
}
result_df = pd.DataFrame({
    label: students_data[column].value_counts().reset_index(drop=True)
    for label, column in size_sources.items()
})

In [191]:
# Side-by-side group sizes: Target classes versus the clusters found by each algorithm
result_df

Unnamed: 0,Target(Reference),kmeans_cluster,gmm_cluster,agglomerative_cluster
0,2209,3503,3496,3390
1,1421,741,748,719
2,794,180,180,315
