# Clustering of OULAD dataset

In [1]:
import collections
import itertools
import os
import subprocess
import time

from IPython.display import clear_output
import graphviz
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import jaccard_score
from sklearn_extra.cluster import KMedoids
from sklearn.tree import export_graphviz

%matplotlib inline
sns.set()
pd.options.display.max_columns = None

In [2]:
import tools.filter_oulad as filter_oulad
from tools.load_oulad import dataset_dict
from tools.validation_oulad import customClusteringScore

In [3]:
# Select one course presentation (module code + presentation code) from the loaded tables.
code_module = 'CCC'
code_presentation = '2014B'
oneCourse = filter_oulad.getOneCourse(dataset_dict, code_module, code_presentation)
# NOTE(review): 14 is presumably a cut-off in days (the notes further down place us at
# week 2 of the course) - confirm against tools.filter_oulad.restructure.
final_df = filter_oulad.restructure(oneCourse, 14)
# NOTE(review): cleanAndMap returns the encoders, so it presumably cleans/encodes final_df
# in place - confirm against tools.filter_oulad.cleanAndMap.
encoders = filter_oulad.cleanAndMap(final_df)
display(final_df)

Unnamed: 0_level_0,score_mean,score_sum,date_submitted_mean,is_banked_sum,assessment_type_CMA_count,assessment_type_TMA_count,date_mean,weight_mean,weight_sum,gender_first,region_first,highest_education_first,imd_band_first,age_band_first,num_of_prev_attempts_first,studied_credits_first,disability_first,final_result_first,date_registration_first,date_unregistration_first,sum_click_mean,sum_click_sum,activity_type_subpage,activity_type_url,activity_type_homepage,activity_type_resource,activity_type_oucontent,activity_type_quiz,activity_type_forumng,activity_type_oucollaborate
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
28418,0.0,0.0,-1.0,0.0,0,0,9.027027,0.0,0.0,0,11,2,25,0,0,30,0,1,-37.0,241.0,2.000000,74.0,16,1,4,10,1,5,0,0
29764,0.0,0.0,-1.0,0.0,0,0,-1.878788,0.0,0.0,1,0,2,55,0,0,90,0,3,-34.0,241.0,4.848485,160.0,6,3,8,2,6,7,1,0
29820,0.0,0.0,-1.0,0.0,0,0,1.578947,0.0,0.0,1,0,3,45,0,0,60,0,2,-57.0,241.0,1.644737,125.0,15,2,15,26,2,3,13,0
40333,0.0,0.0,-1.0,0.0,0,0,1.573770,0.0,0.0,1,4,3,5,1,0,30,0,0,-30.0,17.0,2.360656,144.0,17,7,13,12,3,5,4,0
40604,0.0,0.0,-1.0,0.0,0,0,-5.242424,0.0,0.0,1,2,2,0,1,0,30,0,2,-17.0,241.0,1.772727,117.0,21,3,16,12,7,3,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681198,0.0,0.0,-1.0,0.0,0,0,-6.363636,0.0,0.0,1,0,1,75,1,0,90,0,0,-225.0,133.0,2.136364,47.0,4,1,7,4,2,4,0,0
2686578,0.0,0.0,-1.0,0.0,0,0,4.458333,0.0,0.0,1,6,2,65,0,0,90,0,3,-23.0,241.0,1.166667,28.0,6,1,4,8,1,4,0,0
2692327,0.0,0.0,-1.0,0.0,0,0,-7.690909,0.0,0.0,1,4,2,0,1,0,60,0,3,-25.0,241.0,2.500000,275.0,58,9,12,19,1,10,1,0
2697181,0.0,0.0,-1.0,0.0,0,0,0.345133,0.0,0.0,0,6,3,65,0,0,40,0,2,-186.0,241.0,3.185841,360.0,33,3,16,11,2,25,23,0


In [4]:
# Quick overview of final_df: summary statistics, then the size of each category
print(final_df.describe(include='all'))
for title, column in (('REGION REPARTITION', 'region_first'),
                      ('EDUCATION REPARTITION', 'highest_education_first')):
    print(title)
    print(final_df.groupby(column).size())
print('FINAL RESULT REPARTITION')
print(final_df['final_result_first'].value_counts())
# sanity check: the cleaned frame must not contain any NaN
nanCount = sum(final_df.isna().sum())
print('final_df contains %i NaNs' % nanCount)

       score_mean  score_sum  date_submitted_mean  is_banked_sum  \
count      1577.0     1577.0               1577.0         1577.0   
mean          0.0        0.0                 -1.0            0.0   
std           0.0        0.0                  0.0            0.0   
min           0.0        0.0                 -1.0            0.0   
25%           0.0        0.0                 -1.0            0.0   
50%           0.0        0.0                 -1.0            0.0   
75%           0.0        0.0                 -1.0            0.0   
max           0.0        0.0                 -1.0            0.0   

       assessment_type_CMA_count  assessment_type_TMA_count    date_mean  \
count                     1577.0                     1577.0  1577.000000   
mean                         0.0                        0.0     1.723750   
std                          0.0                        0.0     5.068600   
min                          0.0                        0.0   -18.000000   
25%    

## Clustering

#### IDEA:
- imagine the course just started - we are at week 2
- we want to identify students which might Fail/Withdraw to propose them help
- to start off simple, we don't try to predict WHEN a student will Fail/Withdraw, and we ignore all student interactions with other courses
- we suppose that students that Fail/Withdraw can be distinguished by their behaviour and features but we don't know which ones in advance
- with clustering we can identify groups of student that are similar
- we suppose that students that have similar behaviour / features would have similar outcomes

- we want to measure if the resulting groups could provide useful information for a set of prediction algorithms thereby improving their predictions 

- how does the quality of the generated groups impact the predictors?

- could a consensus clustering algorithm produce an even better set of groups to improve even more the prediction?
- how do the clustering algorithms generalise to different courses / to the same course the following year?
- if each time the optimal hyper params are different - could the consensus clustering 
   algorithm solve this issue by combining multiple choices of hyper params at the same time?

#### Separate response label from training set

In [4]:
# An alternative would be to cluster a previous course (training) and to predict on the
#   current one ... for now we only measure the clustering quality on all the data
# train, test = train_test_split(final_df, random_state=0)

# the response label is kept aside; every other column is rescaled to [0, 1]
testY = final_df['final_result_first']
scaler = MinMaxScaler()
trainX = scaler.fit_transform(final_df.drop(columns=['final_result_first']))

#### Used clustering algorithms and their hyperparameters

In [5]:
import tools.clustering_parameters as params
# Registry of the clustering algorithms used to build the ensemble.
# For every algorithm:
#   'obj'         - the estimator class, instantiated with one parameter set at a time
#   'params'      - named parameter sets (the name ends up in the 'type' column of the results)
#   'paramRanges' - for each ranged parameter, the values to try on top of every parameter set
clustAlgos = {
    "kMeans": {
        'obj': KMeans, 
        'params': params.kmeansParams,
        'paramRanges': {'n_clusters' : range(3, 4)}
    },
    "kMedoits": {
        'obj': KMedoids, 
        'params': params.kMedoidsParams,
        'paramRanges': {'n_clusters' : range(2, 4)}
    },
    "SpectralClustering": {
        'obj': SpectralClustering,
        'params': params.spectralClusteringParams,
        'paramRanges': {'n_clusters' : range(3, 5)}
    },
#     "AgglomerativeClustering": {
#         'obj': AgglomerativeClustering,
#         'params': params.agglomerativeClusteringParams,
#         'paramRanges': {'n_clusters' : range(2, 4)}
#     },
#     "Birch": {
#         'obj': Birch, 
#         'params': params.birchParams,
#         'paramRanges': {'n_clusters' : range(2, 4)}
#     },
    
    # density based - don't use nb_clusters
    # NOTE(review): this entry has no 'paramRanges' key, which the ensemble loop indexes
    # unconditionally - add one (possibly empty) before enabling it.
#     "DBSCAN": {'obj': DBSCAN, 'params': params.dbscanParams},
}

#### Computing the baseClusterings

In [6]:
# 1 Generate the clustering ensemble of the dataset and store the clustering vectors
#   (one column per run) in baseClusterings
baseClusterings = pd.DataFrame(index=testY.index)
# rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0 and was quadratic anyway
resultRows = []
stepCount = 0
# one run per (parameter set, ranged parameter value) pair of every algorithm
totalStepsCount = sum(
    len(algo['params']) * sum(len(values) for values in algo['paramRanges'].values())
    for algo in clustAlgos.values())

for clustAlgoKey, algo in clustAlgos.items():
    for paramsKey in algo['params']:
        for rangedParamKey, rangedParamValues in algo['paramRanges'].items():
            for rangedParamValue in rangedParamValues:
                clear_output(wait=True)
                # work on a copy so that the shared parameter sets are not modified
                runParams = dict(algo['params'][paramsKey])
                runParams[rangedParamKey] = rangedParamValue
                clustAlgo = algo['obj'](**runParams)
                clustAlgo.fit(trainX)
                # we don't want to predict - for now only evaluate the quality of the clustering
                clustAlgo_labels = pd.Series(clustAlgo.labels_, name=clustAlgoKey,
                                             index=testY.index)
                # columns are named after the 1-based run number
                baseClusterings.insert(len(baseClusterings.columns), str(stepCount + 1),
                                       clustAlgo_labels)

                resultRows.append({
                    'k': rangedParamValue,
                    'algo': clustAlgoKey,
                    'type': paramsKey,
                    'score': customClusteringScore(clustAlgo_labels, testY),
                })
                stepCount += 1
                print('step %i/%i (%i%s)' % (stepCount, totalStepsCount,
                                             round(100 * stepCount/totalStepsCount, 2), '%'))

results = pd.DataFrame(resultRows, columns=['k', 'algo', 'type', 'score'])

# Side-by-side table of the nb_lines best runs of every algorithm (best score first);
# algorithms with fewer runs are padded with NaN.
nb_lines = 10
resultTable = pd.DataFrame(index=range(0, nb_lines))
for algoName, algoResults in results.sort_values(by='score', ascending=False).groupby('algo'):
    best = algoResults.head(nb_lines).reset_index(drop=True).reindex(resultTable.index)
    for field in ('k', 'type', 'score'):
        resultTable['%s_%s' % (algoName, field)] = best[field].values

print("BASE CLUSTERINGS")
display(baseClusterings)
print("CUSTOM CLUSTERING SCORE")
display(resultTable)

step 6/6 (100%)
BASE CLUSTERINGS


Unnamed: 0_level_0,1,2,3,4,5,6
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28418,2,2,1,1,2,0
29764,0,0,0,0,0,1
29820,0,0,0,0,0,1
40333,1,1,0,0,1,3
40604,0,0,0,0,0,1
...,...,...,...,...,...,...
2681198,1,1,0,0,1,3
2686578,0,0,1,2,1,1
2692327,0,0,0,0,0,1
2697181,2,2,1,2,2,0


CUSTOM CLUSTERING SCORE


Unnamed: 0,SpectralClustering_k,SpectralClustering_type,SpectralClustering_score,kMeans_k,kMeans_type,kMeans_score,kMedoits_k,kMedoits_type,kMedoits_score
0,3.0,affinity=laplacian,0.400127,3.0,vanilla,0.302473,3.0,vanilla,0.381103
1,4.0,affinity=laplacian,0.306278,3.0,tol=5,0.302473,2.0,vanilla,0.164236
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,
8,,,,,,,,,
9,,,,,,,,,


### MultiCons

In [9]:
# Toy example: D = {1,2,3,4,5,6,7,8,9} partitioned by five base clusterings:
# P1 = {{1,2,3},{4,5,6,7,8,9}},
# P2 = {{1,2,3},{4,5,6,7,8,9}},
# P3 = {{1,2,3,4,5},{6,7},{8,9}},
# P4 = {{4,5,6,7}, {1,2,3},{8,9}}
# P5 = {{4,5,6,7},{1,2,3},{8,9}}
# Each column holds the cluster label of instances 1..9 in one base clustering.
toyPartitions = {
    '1': [0, 0, 0, 1, 1, 1, 1, 1, 1],
    '2': [0, 0, 0, 1, 1, 1, 1, 1, 1],
    '3': [0, 0, 0, 0, 0, 1, 1, 2, 2],
    '4': [1, 1, 1, 0, 0, 0, 0, 2, 2],
    '5': [1, 1, 1, 0, 0, 0, 0, 2, 2],
}
clust = pd.DataFrame(toyPartitions, index=range(1, 10))
membershipMatrix = pd.DataFrame(index=range(1, 10))


In [10]:
# NOTE(review): this replaces the base clusterings computed on the OULAD data by the toy
# example `clust`; every later cell then works on the toy data. Skip this cell to keep the
# real ensemble.
baseClusterings = clust

#### Building the Membership matrix

In [7]:
# there should be a much better way to construct the membership matrix...
# for now let's use a naive implementation

# 3 Build the cluster membership matrix M
def buildMembershipMatrix(baseClusterings):
    '''Return the boolean membership matrix of a clustering ensemble.

    baseClusterings -- DataFrame with one column of cluster labels per base clustering.

    The result has the same index and one column per (clustering, label) pair, named
    "<clustering>P<label>", which is True for the instances carrying that label. Columns
    follow the clustering order, labels in increasing order within a clustering.
    '''
    matrix = pd.DataFrame(index=baseClusterings.index)
    for clusteringName in baseClusterings.columns:
        labels = baseClusterings[clusteringName]
        for label in np.sort(labels.unique()):
            columnName = '%sP%i' % (clusteringName, label)
            # always append on the right to keep the column order stable
            matrix.insert(matrix.shape[1], columnName, labels == label)
    return matrix

# built from whatever baseClusterings currently holds; displayed as 0/1 for readability
# (membershipMatrix itself stays boolean)
membershipMatrix = buildMembershipMatrix(baseClusterings)
membershipMatrix.astype(int)

Unnamed: 0_level_0,1P0,1P1,1P2,2P0,2P1,2P2,3P0,3P1,4P0,4P1,4P2,5P0,5P1,5P2,6P0,6P1,6P2,6P3
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
28418,0,0,1,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0
29764,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0
29820,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0
40333,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1
40604,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681198,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1
2686578,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0
2692327,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0
2697181,0,0,1,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0


#### Generate FCPs

In [8]:
# Thanks to https://github.com/slide-lig/plcmpp for the implementation of the LCM algorithm
# It's cloned and built in the ./FCI directory

start = time.time()
# One transaction per instance: the positions of the membership-matrix columns (base
# clusters) the instance belongs to, space separated.
with open('./FCI/input.txt', 'w') as inputFile:
    for memberships in membershipMatrix.values:
        inputFile.write(' '.join(map(str, np.flatnonzero(memberships))) + '\n')

# 4 Generate FCPs from M for minsupport = 0
# cwd runs the miner inside ./FCI without changing the notebook's own working directory;
# check=True stops here if the miner fails instead of reading a stale output.txt
subprocess.run("./runLCM.sh", cwd='./FCI', check=True)

FCPs = []
with open('./FCI/output.txt', 'r') as outputFile:
    for line in outputFile:
        # a line is "<freq>\t<item> <item> ..."; only the items are used
        items = line.rstrip('\n').split('\t', 1)[-1].split()
        if not items:
            continue
        columnPositions = sorted(map(int, items))
        FCPs.append(list(membershipMatrix.columns[columnPositions]))

end = time.time()
print('Python elapsed time: ' + str(round((end - start), 3) * 1000) + ' ms')
# 5 Sort the FCPs by decreasing pattern length (number of base clusters in the pattern)
FCPs.sort(key = len, reverse = True)
FCPs

Python elapsed time: 773.0 ms


[['1P1', '2P1', '3P1', '4P2', '5P1', '6P1'],
 ['1P2', '2P2', '3P1', '4P2', '5P2', '6P0'],
 ['1P2', '2P2', '3P0', '4P2', '5P2', '6P0'],
 ['1P2', '2P2', '3P1', '4P0', '5P2', '6P0'],
 ['1P2', '2P2', '3P0', '4P0', '5P2', '6P0'],
 ['1P2', '2P2', '3P1', '4P1', '5P2', '6P0'],
 ['1P2', '2P2', '3P0', '4P1', '5P2', '6P0'],
 ['1P1', '2P1', '3P1', '4P2', '5P0', '6P2'],
 ['1P1', '2P1', '3P0', '4P2', '5P0', '6P2'],
 ['1P1', '2P1', '3P0', '4P0', '5P0', '6P2'],
 ['1P1', '2P1', '3P1', '4P1', '5P0', '6P2'],
 ['1P1', '2P1', '3P0', '4P1', '5P0', '6P2'],
 ['1P0', '2P0', '3P0', '4P2', '5P0', '6P2'],
 ['1P1', '2P1', '3P1', '4P1', '5P1', '6P1'],
 ['1P0', '2P0', '3P1', '4P1', '5P0', '6P2'],
 ['1P0', '2P0', '3P1', '4P0', '5P1', '6P1'],
 ['1P1', '2P1', '3P1', '4P2', '5P0', '6P1'],
 ['1P1', '2P1', '3P0', '4P0', '5P1', '6P1'],
 ['1P2', '2P2', '3P0', '4P0', '5P2', '6P2'],
 ['1P2', '2P2', '3P0', '4P1', '5P2', '6P2'],
 ['1P0', '2P0', '3P0', '4P0', '5P0', '6P2'],
 ['1P2', '2P2', '3P0', '4P2', '5P2', '6P2'],
 ['1P0', '

#### Define helper functions

In [12]:
def consensusFunction10(biClust):
    '''Merge overlapping instance sets in place (made from the Algorithm 10).

    Pairs of sets are compared until all remaining sets are pairwise disjoint:
    * disjoint sets are left untouched,
    * a set contained in (or equal to) another one is removed,
    * two partially overlapping sets are replaced by their union.

    biClust -- list of sets of instances; the list is modified in place.
    Returns None.
    '''
    hasChanged = True
    while hasChanged:
        # a full pass without any change means the remaining sets are pairwise disjoint
        hasChanged = False
        i = 0
        while i < len(biClust):
            bi = biClust[i]
            biRemoved = False
            # the intersection is symmetric, so only the pairs with j > i are compared
            j = i + 1
            while j < len(biClust):
                bj = biClust[j]
                common = bi & bj
                if not common:
                    j += 1
                elif len(common) == len(bi):
                    # Bi ⊆ Bj: Bi brings nothing new
                    del biClust[i]
                    biRemoved = True
                elif len(common) == len(bj):
                    # Bj ⊂ Bi: drop Bj, the next candidate slides into position j
                    del biClust[j]
                    hasChanged = True
                else:
                    # partial overlap: the union takes the place of Bj, Bi disappears
                    biClust[j] = bi | bj
                    del biClust[i]
                    biRemoved = True

                if biRemoved:
                    # Bi is gone: stop comparing with the stale set; position i now
                    # holds the next set, which is processed without advancing i
                    hasChanged = True
                    break

            if not biRemoved:
                i += 1

def assignLabel(biClust, maxDT, consVctrs):
    '''
    8 - Give a label to every set of biClust to build a consensus vector and append it
    to the list consVctrs.
    * For convenience the consensus is stored as a dictionary {label: instance set},
    * e.g. {'maxDT=5|0': {1, 2, 3}, 'maxDT=5|1': {4, 5}, ...}, instead of the label
    * vector [0, 0, 0, 1, 1, ...]; it is converted to a vector later on.
    * The label is 'maxDT=<maxDT>|<position of the set in biClust>'.
    '''
    consVctrs.append({'maxDT=%i|%i' % (maxDT, position): instances
                      for position, instances in enumerate(biClust)})

def jaccard(x, y):
    '''
    Jaccard similarity of two consensuses, each one seen as a set of clusters.

    x and y are two dictionaries mapping a label to a set of instances; the labels and
    the order of the entries are ignored.
    returns |x∩y|/|x∪y| where x∩y are the clusters present in both consensuses and x∪y
    the clusters present in at least one of them. The result is 1 only when both
    consensuses contain exactly the same clusters; two empty consensuses are
    considered identical (1.0).
    '''
    xClusters = {frozenset(cluster) for cluster in x.values()}
    yClusters = {frozenset(cluster) for cluster in y.values()}
    allClusters = xClusters | yClusters
    if not allClusters:
        return 1.0
    return len(xClusters & yClusters) / len(allClusters)

#### MultiCons - part after FCP are generated

In [13]:
def _instanceSetsOfSize(dt):
    '''Return the instance sets of the FCPs built from exactly dt base clusters.'''
    # sorted by their string form to get a reproducible order
    patterns = sorted((fcp for fcp in FCPs if len(fcp) == dt), key=str)
    instanceSets = []
    for fcp in patterns:
        # an instance supports the pattern when it belongs to every base cluster in it
        isInSet = membershipMatrix[fcp].all(axis=1).values
        instanceSets.append(set(baseClusterings.index[isInSet].tolist()))
    return instanceSets


def _consensusLabels(consensus):
    '''Convert a consensus {label: instance set} into a vector of cluster numbers.

    The result is a Series aligned on baseClusterings.index: the instances of the n-th
    set get the value n, instances that are in no set get -1.
    '''
    labels = pd.Series(-1, name='consensus', index=baseClusterings.index, dtype='int')
    for clusterId, members in enumerate(consensus.values()):
        labels.loc[list(members)] = clusterId
    return labels


# 6 MaxDT←length(BaseClusterings)
maxDT = len(baseClusterings.columns)
# 7 BiClust ← {instance sets of FCPs built from MaxDT base clusters}
# here BiClust is a list of sets of instances
biClust = _instanceSetsOfSize(maxDT)
consVctrs = []
# 8 Assign a label to each set in BiClust to build the first consensus vector and store
# it in a list of vectors ConsVctrs
assignLabel(biClust, maxDT, consVctrs)
# 9 Build the remaining consensuses
# 10 for DT = (MaxDT−1) to 1 do
for dt in range(maxDT - 1, 0, -1):
#   11 BiClust ← BiClust ∪ {instance sets of FCPs built from DT base clusters}
    biClust.extend(_instanceSetsOfSize(dt))
#   12 Call the consensus function (Algo. 10)
    consensusFunction10(biClust)
#   13 Assign a label to each set in BiClust to build a consensus vector and add it to
#   ConsVctrs
    assignLabel(biClust, dt, consVctrs)

# 15 Remove similar consensuses
# 16 ST ← Vector of ‘1’s of length MaxDT
st = [1] * maxDT
# 17 for i = MaxDT to 2 do
# in python the index starts with 0 -> using maxDT - 1 to 1
i = maxDT - 1
while i > 0:
#   18 Vi ← ith consensus in ConsVctrs
    vi = consVctrs[i]
#   19 for j = (i−1) to 1 do
#   in python the index starts with 0 -> (i−1) to 0
    j = i - 1
    while j >= 0:
#       20 Vj ← jth consensus in ConsVctrs
        vj = consVctrs[j]
        if jaccard(vi, vj) == 1:
            # Vj duplicates Vi: count it in the stability of Vi and drop it.
            # Vi moves one position down as j < i, hence the i -= 1
            st[i] += 1
            del st[j]
            del consVctrs[j]
            i -= 1

        j -= 1

    i -= 1

# 27 Find the consensus the most similar to the ensemble
# 28 L ← length(ConsVctrs)
L = len(consVctrs)
# 29 TSim ← Vector of ‘0’s of lengthL
tSim = [0] * L
# 30 for i = 1 to L do
for i in range(0, L):
#   31 Ci ← ith consensus in ConsVctrs
#   converted from the dict {'maxDT=5|0': {1, 2, 3}, 'maxDT=5|1': {4, 5}, ... }
#   to the label vector [0, 0, 0, 1, 1, ...]
    ci = _consensusLabels(consVctrs[i])
#   32 for j = 1 to MaxDT do
    for j in range(maxDT):
#       33 Cj ← jth clustering in BaseClusterings
        cj = baseClusterings.iloc[:, j]
#       34 TSim[i] ← TSim[i] + Jaccard(Ci,Cj)
        tSim[i] += jaccard_score(ci, cj, average='macro')
#   36 Sim[i] ← TSim[i] / MaxDT
    tSim[i] /= maxDT

# 38 RecommCons ← which.max(TSim)
recommCons = tSim.index(max(tSim)) if tSim else None
for i in range(0, len(tSim)):
    selectedConsensus = consVctrs[i]
#   we add 'selected' to the corresponding consensus
    isSelected = 'selected!' if i == recommCons else ''
    clustAlgo_labels = _consensusLabels(selectedConsensus)
#   -1 marks instances without a cluster and is not counted as a cluster
    nbClusters = clustAlgo_labels[clustAlgo_labels >= 0].nunique()

#     score = customClusteringScore(encoders, clustAlgo_labels, testY)
#     print('stability=%i similarity=%f customScore=%f k=%i %s' % \
#           (st[i], tSim[i], score, clustAlgo_labels.nunique(), isSelected))
    print('stability=%i similarity=%f k=%i repartition=%s %s' % \
          (st[i], tSim[i], nbClusters, str(consVctrs[i]), isSelected))

stability=1 similarity=0.002712 k=44 repartition={'maxDT=6|0': {555008, 600073, 2241036, 625684, 122392, 2156058, 556587, 2433069, 625714, 1763378, 553021, 29764, 342089, 598113, 625782, 29820, 572036, 634500, 608904, 502929, 623250, 633493, 40604, 631455, 324257, 491170, 242858, 588460, 588473, 575166, 502465, 550092, 515278, 556246, 2134237, 622302, 582879, 2692327, 569080, 634618, 556295, 576780, 613133, 2087182, 601872, 568084, 2602264, 625953, 634157, 2258228, 523061, 626501, 607052, 615246, 630606, 129878, 611176, 2239848, 269676, 473965, 1997164, 383347, 628086, 571776, 363906, 1750405, 577414, 509839, 625552, 631699, 600987, 576413, 633256, 557995, 580525, 628663, 602042, 594371, 630211, 2131908, 612814, 634853, 627691, 117232, 2486769, 285170, 578034, 548854, 1891323}, 'maxDT=6|1': {585093, 615437, 565399, 407453, 124193, 515107, 206245, 160300, 236205, 625453, 2109119, 539328, 319042, 622787, 329673, 618571, 609229, 629710, 478550, 554205, 180976, 1753458}, 'maxDT=6|2': {6241

In [11]:
# Trying to improve the similarity by rearranging the partition labels,
# for example for the clustering [0,0,0,1,1,2,2] trying also:
# [1,1,1,0,0,2,2] , [2,2,2,1,1,0,0] , [0,0,0,2,2,1,1], etc...
# WARNING: a consensus with k clusters is scored k! times, so this is only usable for
# small k ... TODO -> improve performance!

# position of every instance in baseClusterings (the first occurrence wins)
positionOf = {}
for position, instance in enumerate(baseClusterings.index):
    positionOf.setdefault(instance, position)

L = len(consVctrs)
tSim = [0] * L
orderedConsVctrs = []
for i in range(0, L):
    ci = consVctrs[i]
    # the positions of the members do not depend on the permutation: computed once
    memberPositions = {key: [positionOf[member] for member in ci[key]] for key in ci}
    bestSimilarity = None
    bestPermutation = None
    for perm in itertools.permutations(ci):
        # label vector of the consensus when the clusters are numbered in perm order
        tempArray = [-1] * len(baseClusterings.index)
        for indx, key in enumerate(perm):
            for position in memberPositions[key]:
                tempArray[position] = indx

        tempSimilarity = 0
        for j in baseClusterings.columns:
            cj = baseClusterings[j]
            tempSimilarity += jaccard_score(cj, tempArray, average='macro')
        tempSimilarity /= maxDT

        # only the best permutation is kept; the strict comparison keeps the first
        # permutation that reaches the maximum
        if bestSimilarity is None or tempSimilarity > bestSimilarity:
            bestSimilarity = tempSimilarity
            bestPermutation = perm

    tSim[i] = bestSimilarity
    orderedConsVctrs.append(
        collections.OrderedDict((key, ci[key]) for key in bestPermutation))

recommCons = tSim.index(max(tSim)) if tSim else None
for i in range(0, len(tSim)):
    selectedConsensus = orderedConsVctrs[i]
    isSelected = 'SELECTED!' if i == recommCons else ''
    # cluster number of every instance; -1 marks instances that are in no cluster
    clustAlgo_labels = pd.Series(-1, name='consensus', index=baseClusterings.index, dtype='int')
    for ii, key in enumerate(selectedConsensus):
        clustAlgo_labels.loc[list(selectedConsensus[key])] = ii
    nbClusters = clustAlgo_labels[clustAlgo_labels >= 0].nunique()

    print('stability=%i similarity=%f k=%i repartition= %s\n      %s\n\
___________________________________________________________________' % \
          (st[i], tSim[i], nbClusters, isSelected, str(orderedConsVctrs[i])))

KeyboardInterrupt: 

In [15]:
# NOTE(review): clustAlgo_labels is whatever the last printing loop left behind, i.e. the
# labels of the last consensus that was printed, not necessarily the one marked as
# selected - confirm this is the intended consensus.
score = customClusteringScore(clustAlgo_labels, testY)
#     print('stability=%i similarity=%f customScore=%f k=%i %s' % \
#           (st[i], tSim[i], score, clustAlgo_labels.nunique(), isSelected))
score

0.16423589093214966

#### Selecting the data

In [None]:
code_module = 'AAA'
code_presentation = '2013J'

# training set: one presentation of the module
oneTrainingCourse = filter_oulad.getOneCourse(dataset_dict, code_module, code_presentation)
training_final_df = filter_oulad.restructure(oneTrainingCourse, 14)
training_encoders = filter_oulad.cleanAndMap(training_final_df)

scaler = MinMaxScaler()
trainX = training_final_df.drop(['final_result_first'], axis=1)
trainX = scaler.fit_transform(trainX)
trainY = training_final_df['final_result_first']

# test set: a later presentation of the same module
code_presentation = '2014J'

oneTestCourse = filter_oulad.getOneCourse(dataset_dict, code_module, code_presentation)
testing_final_df = filter_oulad.restructure(oneTestCourse, 14)
testing_encoders = filter_oulad.cleanAndMap(testing_final_df)

testX = testing_final_df.drop(['final_result_first'], axis=1)
# reuse the scaling learned on the training course: re-fitting the scaler on the test
# course would map the same raw value to different scaled values in the two sets
# (scaled test values may therefore fall slightly outside [0, 1])
testX = scaler.transform(testX)
testY = testing_final_df['final_result_first']

### Advanced Deep Learning with Python: Design and implement advanced next-generation AI solutions using TensorFlow and PyTorch

In [None]:
import tensorflow as tf
# TF1-compat session configured with log_device_placement=True
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
# the GPUs visible to TensorFlow (this list is the cell output)
tf.config.list_physical_devices('GPU')
# tf.test.is_built_with_cuda()


In [None]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages, then three linear layers."""

    def __init__(self):
        super().__init__()
        # convolutions with 3x3 square kernels: 1 input image channel -> 6 -> 16 channels
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # affine operations (y = Wx + b); 6*6 comes from the image dimension
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch x and return the output of the last linear layer."""
        # max pooling over a (2, 2) window after each convolution; for a square window
        # a single number is enough
        pooled = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), 2)
        # one row of features per sample
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of values per sample: the product of all dimensions of x
        except the first (batch) one."""
        return x.size()[1:].numel()


# Walk-through on random data: forward pass, backward pass, loss, one optimizer step and
# device selection. The statements draw random numbers one after the other, so their
# order matters for reproducibility.
net = Net()
print(net)
# NOTE(review): this rebinds `params`, the alias used for tools.clustering_parameters
# in the clustering cells above.
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)
# the result of this call is not used
torch.randn(1, 10)
net.zero_grad()
# backward pass with a random gradient of the same shape as the output
out.backward(torch.randn(1, 10))
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU
net.zero_grad()     # zeroes the gradient buffers of all parameters
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

# use the first CUDA device when one is available, the CPU otherwise
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
net.to(device)

# Remember that you will have to send the inputs and targets at every step to the GPU too:
# inputs, labels = data[0].to(device), data[1].to(device)

# Remember that you will have to send the inputs and targets at every step to the GPU too:
# inputs, labels = data[0].to(device), data[1].to(device)




In [None]:
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda, Input, Dense
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.models import Model

# TF1-compat session with explicit GPU memory options
config = tf.compat.v1.ConfigProto()
# NOTE(review): presumably caps this process at about a third of the GPU memory and lets
# the allocation grow on demand - confirm against the TensorFlow GPUOptions documentation.
config.gpu_options.per_process_gpu_memory_fraction=0.333
config.gpu_options.allow_growth=True
sess = tf.compat.v1.Session(config=config)

In [None]:
# size of the second dimension of trainX; build_vae uses it as the input and output width
image_size = trainX.shape[1]

In [None]:

def build_vae(intermediate_dim=512, latent_dim=2):
    """Build a variational auto-encoder and return (encoder, decoder, vae).

    :param intermediate_dim: width of the hidden Dense layer of the encoder and decoder
    :param latent_dim: dimension of the latent space

    Reads the module-level ``image_size`` (input / output width) and ``sampling``
    (reparameterization function), which must be defined before this is called.
    Prints the encoder and decoder summaries. The loss (reconstruction + KL) is attached
    to the returned ``vae`` with ``add_loss``, so no loss is needed at compile time.
    """
    # encoder first
    inputs = Input(shape=(image_size,), name='encoder_input')
    x = Dense(intermediate_dim, activation='relu')(inputs)
    # latent mean and variance
    z_mean = Dense(latent_dim, name='z_mean')(x)
    z_log_var = Dense(latent_dim, name='z_log_var')(x)
    # Reparameterization trick for random sampling    
    # Note the use of the Lambda layer    
    # At runtime, it will call the sampling function
    z = Lambda(sampling, output_shape=(latent_dim,),name='z')([z_mean, z_log_var])
    # full encoder model: returns the mean, the log-variance and the sampled z
    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
    encoder.summary()
    
    # decoder
    latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
    x = Dense(intermediate_dim, activation='relu')(latent_inputs)
    outputs = Dense(image_size, activation='sigmoid')(x)
    # full decoder model
    decoder = Model(latent_inputs, outputs, name='decoder')
    decoder.summary()
    # VAE model: the decoder is fed with the sampled z (third output of the encoder)
    outputs = decoder(encoder(inputs)[2])
    vae = Model(inputs, outputs, name='vae')
    # Loss function   
    # we start with the reconstruction loss
    reconstruction_loss = binary_crossentropy(inputs, outputs) * image_size
    # next is the KL divergence
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    # we combine them in a total loss
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    return encoder, decoder, vae

In [None]:
def sampling(args: tuple):
    """Draw a latent sample z = mean + exp(0.5 * log_var) * epsilon.

    :param args: (tensor, tensor) mean and log of variance of q(z|x)
    """
    mean, log_var = args
    # epsilon is a standard normal vector (mean=0, std=1.0) with one row per element of
    # the mini-batch and one column per latent dimension
    batch_size = K.shape(mean)[0]
    latent_size = K.int_shape(mean)[1]
    noise = K.random_normal(shape=(batch_size, latent_size))
    return mean + K.exp(0.5 * log_var) * noise

In [None]:
# build_vae attaches the loss to the model itself, so compile only gets the optimizer
encoder, decoder, vae = build_vae()
vae.compile(optimizer='adam')
vae.summary()
# no targets are passed: the loss attached to the model is computed from the inputs alone
vae.fit(trainX, epochs=50, batch_size=128, validation_data=(testX, None))
# output of the VAE for the test set (displayed as the cell output)
vae.predict(testX)

#### WIP Doing without understanding (for now)

In [None]:
import keras
from keras import models
from keras import layers

In [None]:
# Small fully connected classifier: 40 -> 30 -> 4 (softmax over the final results)
network = models.Sequential()
network.add(layers.Dense(40, activation='relu', input_shape=(trainX.shape[1],)))
# network.add(layers.Dense(1000, activation='relu'))
network.add(layers.Dense(30, activation='relu'))
network.add(layers.Dense(4, activation='softmax'))
network.compile(optimizer='rmsprop', # optimizers.RMSprop(lr=0.001)
                loss='categorical_crossentropy', # binary_crossentropy
                metrics=['accuracy'])
# validation_data takes precedence over validation_split in Keras, so the former
# validation_split=0.75 had no effect and has been removed
history = network.fit(trainX, pd.get_dummies(trainY), epochs=5, batch_size=50,
                      validation_data=(testX, pd.get_dummies(testY)))

In [None]:
# loss and accuracy on the test set, with the labels one-hot encoded as during training
test_loss, test_acc = network.evaluate(testX, pd.get_dummies(testY))

In [None]:
# accuracy returned by network.evaluate on the test set
print('test_acc:', test_acc)

In [None]:
# Learning curves recorded by network.fit, one point per epoch
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.clf()   # clear figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
# this figure shows the accuracy (the axis was labelled 'Loss' by mistake)
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# confusion_matrix is not part of the notebook's import cell
from sklearn.metrics import confusion_matrix

# predicted class = position of the largest output of the softmax layer
predictions = np.argmax(network.predict(testX), axis=1)

# NOTE(review): index1 / index2 (column and row labels) are not defined in the visible
# part of the notebook - make sure they exist before running this cell.
pd.DataFrame(confusion_matrix(testY, predictions, labels=[0,1,2,3]), \
             columns=index1, index=index2)