In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Loading the data
emotion_df=pd.read_csv("emotion_data/1/emotion.csv") # Loading emotion score of first student
gaze_df=pd.read_csv("emotion_data/1/gaze.csv") # Loading gaze score of first student
transcript_df=pd.read_csv("transcript_data/1.csv") # Loading transcript score of first student
gaze_df=gaze_df.drop(gaze_df.index[-1]) # Due to one row difference between emotion and gaze for 1st student and to keep unniformity
lengths1=[len(emotion_df.index)] # To keep track of number of rows of each student's emotion and gaze 
lengths2=[len(transcript_df.index)] # To keep track of number of rows of each student for transcript score 

In [3]:
for i in range(2,11):
    df1=pd.read_csv("emotion_data/"+str(i)+"/emotion.csv")
    df2=pd.read_csv("emotion_data/"+str(i)+"/gaze.csv")
    df3=pd.read_csv("transcript_data/"+str(i)+".csv")
    # Merging the dataframes
    emotion_df=pd.merge(emotion_df,df1,how='outer') 
    gaze_df=pd.merge(gaze_df,df2,how='outer')
    transcript_df=pd.merge(transcript_df,df3,how='outer')
    lengths1.append(len(df1.index)) # Storing the number of rows of each student's emotion and gaze
    lengths2.append(len(df3.index)) # Storing the number of rows of each student's transcript score

In [4]:
# Dropping the unwanted and redundant columns
emotion_df=emotion_df.drop(['movie_id','image_seq','dominant_emotion'],axis=1) 
gaze_df=gaze_df.drop(['movie_id','image_seq','blink','eye_offset'],axis=1)
transcript_df=transcript_df.iloc[:,9:]

In [5]:
result1_df=pd.concat([emotion_df,gaze_df],axis=1) # result1 has the concatenated dataframe of emotion_df and gaze_df
result2_df=transcript_df #result2 has the dataframe transcript_df

In [6]:
result1_df

Unnamed: 0,angry,disgust,fear,happy,sad,surprise,neutral,gaze
0,4.317350,5.942640e-04,2.879790,1.650350e+00,2.779980,0.600814,87.77110,1
1,53.225300,2.981640e+00,12.736800,1.523470e+00,1.051320,27.216800,1.26462,1
2,8.796510,2.946810e-02,2.968160,1.683150e+01,39.884600,0.279335,31.21050,1
3,9.453030,1.067780e-01,1.553080,2.093010e+01,3.503870,0.909426,63.54370,1
4,56.000200,4.152410e-06,0.162231,5.583580e+00,0.197026,12.807600,25.24940,1
...,...,...,...,...,...,...,...,...
742,21.623500,3.223740e-01,55.701200,1.837300e+00,14.471100,2.007100,4.03747,0
743,0.483833,8.153230e-05,83.415300,2.197600e+00,12.474100,0.059187,1.36993,1
744,0.175224,4.728190e-10,13.272400,1.959540e-09,63.701500,0.000002,22.85090,1
745,0.326095,2.007640e-05,1.177400,3.822260e-02,33.006200,0.011101,65.44100,1


In [7]:
result2_df

Unnamed: 0,no_speech_prob,positive,negative,neutral,confident,hesitant,concise,enthusiastic,speech_speed
0,0.635880,0.580265,0.152281,0.267454,0.846701,0.845698,0.635805,0.647783,2.517986
1,0.635880,0.550327,0.189263,0.260410,0.679283,0.733701,0.544145,0.417390,3.217822
2,0.635880,0.639860,0.111150,0.248990,0.902729,0.834620,0.715861,0.700062,2.868852
3,0.635880,0.441894,0.399186,0.158919,0.774308,0.813044,0.522462,0.279916,3.750000
4,0.635880,0.236254,0.532010,0.231735,0.286049,0.561375,0.334381,0.197305,3.541667
...,...,...,...,...,...,...,...,...,...
169,0.036832,0.737435,0.063301,0.199264,0.821343,0.204142,0.422417,0.254029,3.169014
170,0.036832,0.594038,0.206492,0.199470,0.455449,0.631635,0.221028,0.127612,2.884615
171,0.036832,0.587039,0.207191,0.205771,0.127398,0.436416,0.020206,0.275292,2.866242
172,0.101272,0.542674,0.259974,0.197352,0.539320,0.450221,0.284381,0.104140,2.571429


In [8]:
# to_numpy convert a dataframe to a numpy array
result1=result1_df.to_numpy()
result2=result2_df.to_numpy()

In [9]:
from sklearn.preprocessing import MinMaxScaler # This is required for Min-Max Scaling
 
scaler = MinMaxScaler()
model=scaler.fit(result1)
scaled_result1=model.transform(result1) # This is the scaled result1

scaler = MinMaxScaler()
model=scaler.fit(result2)
scaled_result2=model.transform(result2)  # This is the scaled result2

In [12]:
# We will have 2 clusters, 1-student should be recruited and 0-student should not be recruited
# For sake of uniformity, if there is a combination of desirable emotion and gaze, or ,transcript it will always belong to cluster 1
# Ideal case arrays, to make sure ideal combinations of emotions, gaze and transcript always in cluster 1
ideal1 = [0,0,0,1,0,0.5,1,1] # Array with ideal values of emotion and gaze
ideal2 = [0,1,0,1,1,0,1,1,0.5] # Array with ideal values of transcript scores

In [13]:
from sklearn.metrics import davies_bouldin_score # To analyse the perfomance of the clustering algorithms

In [14]:
from sklearn.cluster import KMeans

In [15]:
# random_state has been initialised to 0 so that centroids chosen remain the same everytime the code is run
# n_init is the number of times the algorithm will be run with different initialisations and choose the one with lowest inertia
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans.fit(scaled_result1) # Computes K-Means Clustering for scaled_result1
kmeans.predict([ideal1]) # To check if ideal case is giving 0 or 1

array([1])

In [16]:
predictions=kmeans.labels_ # These are the cluster labels for each row, by K-Means
db_index_kmeans_1 = davies_bouldin_score(scaled_result1, predictions) # Stores value of DB Index when using K-Means for scaled_result1
ans1_kmeans=[] # The final prediction scores of each student using emotion and gaze and K-Means is stored in this list
i=0
for l in lengths1: # Using lengths1 to help in taking average of all rows that contribute to each student
    ans1_kmeans.append(np.average(predictions[i:i+l]))
    i+=l
ans1_kmeans

[0.3563218390804598,
 0.28735632183908044,
 0.31,
 0.9696969696969697,
 1.0,
 1.0,
 0.11494252873563218,
 0.7311827956989247,
 0.32558139534883723,
 0.12222222222222222]

In [17]:
# Similarly doing for scaled_result2
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans.fit(scaled_result2)
kmeans.predict([ideal2])

array([0])

In [18]:
# Since in this case ideal values are belonging to cluster-0, we need to swap the clusters
# This is to make sure that ideal and good values always remain in cluster 1 for uniformity
predictions=kmeans.labels_
predictions

array([0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [19]:
predictions=1-predictions # Simple way to swap the clusters if only 2 clusters are present
predictions # Now ideal case will belong to cluster 1

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [20]:
db_index_kmeans_2 = davies_bouldin_score(scaled_result2, predictions) # DB Index when using K-Means for scaled_result2
ans2_kmeans=[] # The final prediction scores of each student using transcript scores and K-Means is stored in this list
i=0
for l in lengths2: # Using lengths2 to help in taking average of all rows that contribute to each student
    ans2_kmeans.append(np.average(predictions[i:i+l]))
    i+=l
ans2_kmeans

[0.6666666666666666,
 0.5263157894736842,
 0.35714285714285715,
 0.5263157894736842,
 0.47058823529411764,
 0.4444444444444444,
 0.46153846153846156,
 0.5625,
 0.4444444444444444,
 0.35294117647058826]

In [21]:
# Gaussian Misture Model

In [22]:
from sklearn.mixture import GaussianMixture

In [32]:
# n_components tells the algorithm how many clusters to make
gaussian_model = GaussianMixture(n_components=2)
gaussian_model.fit(scaled_result1) # train the data
gaussian_result = gaussian_model.predict(scaled_result1) # assign each data point to a cluster
gaussian_model.predict([ideal1]) # To check if ideal case is giving 0 or 1

array([1], dtype=int64)

In [33]:
db_index_gaussian_1 = davies_bouldin_score(scaled_result1, gaussian_result) # DB Index when using GMM for scaled_result1
ans1_gaussian=[]  # The final prediction scores of each student using emotion, gaze and GMM is stored in this list
i=0
for l in lengths1:
    ans1_gaussian.append(np.average(gaussian_result[i:i+l]))
    i+=l
ans1_gaussian

[0.6206896551724138,
 0.6091954022988506,
 0.45,
 0.7878787878787878,
 1.0,
 1.0,
 0.7816091954022989,
 0.946236559139785,
 0.9651162790697675,
 0.7333333333333333]

In [36]:
# Similarly doing for scaled_result2
gaussian_model = GaussianMixture(n_components=2)
gaussian_model.fit(scaled_result2) # train the data
gaussian_result = gaussian_model.predict(scaled_result2) # assign each data point to a cluster
gaussian_model.predict([ideal2])

array([1], dtype=int64)

In [37]:
db_index_gaussian_2 = davies_bouldin_score(scaled_result2, gaussian_result) # DB Index when using GMM for scaled_result2
ans2_gaussian=[] # The final prediction scores of each student using transcript score and GMM is stored in this list
i=0
for l in lengths2:
    ans2_gaussian.append(np.average(gaussian_result[i:i+l]))
    i+=l
ans2_gaussian

[0.16666666666666666,
 0.3157894736842105,
 0.5714285714285714,
 0.42105263157894735,
 0.47058823529411764,
 0.3333333333333333,
 0.38461538461538464,
 0.5,
 0.4444444444444444,
 0.5882352941176471]

In [38]:
# Agglomerative Clustering 

In [39]:
from sklearn.cluster import AgglomerativeClustering

In [40]:
# n_clusters tells the algorithm how many clusters are there
agglomerative_model = AgglomerativeClustering(n_clusters=2)
scaled_result1_with_ideal=np.row_stack([ideal1,scaled_result1]) # Including the ideal case to check whcih cluster it belongs
agglomerative_result = agglomerative_model.fit_predict(scaled_result1_with_ideal) # Performing prediction 
agglomerative_result[0]

0

In [41]:
# As ideal case belongs to cluster 0, clusters have to be swapped after prediction
agglomerative_result = agglomerative_model.fit_predict(scaled_result1) # Predicting for the scaled_result1
agglomerative_result=1-agglomerative_result # Now ideal case will belong to cluster 1

In [42]:
db_index_agglomerative_1 = davies_bouldin_score(scaled_result1, agglomerative_result) # DB Index when using Agglomerative for scaled_result1
ans1_agglomerative=[]  # The final prediction scores of each student using emotion, gaze and Agglomerative is stored in this list
i=0
for l in lengths1:
    ans1_agglomerative.append(np.average(agglomerative_result[i:i+l]))
    i+=l
ans1_agglomerative

[0.6206896551724138,
 0.6091954022988506,
 0.45,
 0.7878787878787878,
 1.0,
 1.0,
 0.7816091954022989,
 0.946236559139785,
 0.9651162790697675,
 0.7333333333333333]

In [43]:
# Similarly doing for scaled_result2
agglomerative_model = AgglomerativeClustering(n_clusters=2)
scaled_result2_with_ideal=np.row_stack([ideal2,scaled_result2]) # Including the ideal case to check whcih cluster it belongs
agglomerative_result = agglomerative_model.fit_predict(scaled_result2_with_ideal) # Performing prediction 
agglomerative_result[0]

0

In [44]:
# As ideal case belongs to cluster 0, clusters have to be swapped after prediction
agglomerative_result = agglomerative_model.fit_predict(scaled_result2) # Predicting for the scaled_result1
agglomerative_result=1-agglomerative_result # Now ideal case will belong to cluster 1

In [45]:
db_index_agglomerative_2 = davies_bouldin_score(scaled_result2, agglomerative_result)  # DB Index when using Agglomerative for scaled_result2
ans2_agglomerative=[] # The final prediction scores of each student using transcript score and Agglomerative is stored in this list
i=0
for l in lengths2:
    ans2_agglomerative.append(np.average(agglomerative_result[i:i+l]))
    i+=l
ans2_agglomerative

[0.8888888888888888,
 0.8947368421052632,
 0.6785714285714286,
 0.6842105263157895,
 0.7058823529411765,
 0.8333333333333334,
 0.7692307692307693,
 0.5625,
 0.8888888888888888,
 0.7647058823529411]

In [46]:
# Now we need to check which clustering algorithm works best. 
# The algortihm with the lowest Davies-Bouldin Index is the best algorithm

In [47]:
db_index_kmeans_1

1.2723514577449737

In [48]:
db_index_kmeans_2

1.7255051146526268

In [49]:
db_index_gaussian_1

1.1430053187993996

In [50]:
db_index_gaussian_2

1.8186697617003467

In [51]:
db_index_agglomerative_1

1.1430053187993996

In [52]:
db_index_agglomerative_2

1.5727742568753424

In [53]:
# For both the datasets, agglomerative clustering is having the lowest DB Index
# So it is the best algorithm

In [54]:
final_ans = [[x,y] for x, y in zip(ans1_agglomerative, ans2_agglomerative)] # Stores the final predicted recruitability of all 10 students
# First element shows the recruitability of student based on their emotion and gaze
# Second element shows the recruitability of student based on their transcript score
# If value is closer to 1, then student is more suitable for recruitment
final_ans 

[[0.6206896551724138, 0.8888888888888888],
 [0.6091954022988506, 0.8947368421052632],
 [0.45, 0.6785714285714286],
 [0.7878787878787878, 0.6842105263157895],
 [1.0, 0.7058823529411765],
 [1.0, 0.8333333333333334],
 [0.7816091954022989, 0.7692307692307693],
 [0.946236559139785, 0.5625],
 [0.9651162790697675, 0.8888888888888888],
 [0.7333333333333333, 0.7647058823529411]]