In [2]:
import pandas as pd
import numpy as np
import os
from random import randint

# 1 Pré-traitement des données

## 1.0 Compréhension du sujet

Score :
* 1 ou a ou A vaut 1 point
* 2 ou b ou B vaut 0 point
* 3 ou c ou C vaut 2 points

Interpretation :
* score < 10 => C
* score < 20 => B
* score < 30 => A


## 1.1 - Importation du fichier CSV

In [3]:
# Importation du jeu de données.
data = pd.read_csv('combined_csv.csv')
data

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Score,Interpretation
0,a,a,a,a,a,1,1,1,1,1,10,B
1,b,b,b,b,b,2,2,2,2,2,0,C
2,c,c,c,c,c,3,3,3,3,3,20,A
3,a,b,c,a,b,1,2,3,1,2,8,C
4,b,c,a,c,a,3,2,3,1,2,11,B
...,...,...,...,...,...,...,...,...,...,...,...,...
230,c,c,,c,c,,,3.0,3,3,14,B
231,b,a,c,,b,,2.0,,3,2,5,C
232,a,c,b,,,1.0,,3.0,2,3,8,C
233,a,n,e,a,b,v,t,Y,',1,3,C


## 1.2 Description et information du fichier CSV

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Q1              227 non-null    object
 1   Q2              223 non-null    object
 2   Q3              222 non-null    object
 3   Q4              225 non-null    object
 4   Q5              221 non-null    object
 5   Q6              223 non-null    object
 6   Q7              225 non-null    object
 7   Q8              222 non-null    object
 8   Q9              225 non-null    object
 9   Q10             227 non-null    object
 10  Score           235 non-null    int64 
 11  Interpretation  235 non-null    object
dtypes: int64(1), object(11)
memory usage: 22.2+ KB


In [5]:
print(data.shape, data.columns)

(235, 12) Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Score',
       'Interpretation'],
      dtype='object')


In [6]:
data.head(10)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Score,Interpretation
0,a,a,a,a,a,1,1,1,1,1,10,B
1,b,b,b,b,b,2,2,2,2,2,0,C
2,c,c,c,c,c,3,3,3,3,3,20,A
3,a,b,c,a,b,1,2,3,1,2,8,C
4,b,c,a,c,a,3,2,3,1,2,11,B
5,c,b,a,c,b,2,3,1,3,2,10,B
6,g,z,ju,d,u,4,7,2,3,1,3,C
7,j,f,d,t,u,2,6,3,1,3,5,C
8,a,d,c,b,a,2,3,7,1,3,9,C
9,a,b,c,e,d,2,3,2,1,3,8,C


In [7]:
print(data.columns.tolist()[:10])
print(data.columns[:10])

['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10']
Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10'], dtype='object')


# 1.3 Traitement des données

In [8]:
# On récupère juste les questions car nous allons recalculer les scores et les interprétations après.
data = data.iloc[:, :10]

val_rand = ['1', '2', '3']
for headers in list(data):
    for i in range(0, len(data[headers])):
        # On transforme les a, b, c en 1, 2, 3.
        if data[headers][i] == 'a' or data[headers][i] == 'A':
            data[headers][i] = '1'
        elif data[headers][i] == 'b' or data[headers][i] == 'B':
            data[headers][i] = '2'
        elif data[headers][i] == 'c' or data[headers][i] == 'C':
            data[headers][i] = '3'
        
        # On enlève les nan.
        # On transforme les erreurs de saisie et les nan en 2 (0 point) ou 1, 2 ou 3 (nombre de points aléatoires).
        # Commentez la méthode que vous ne voulez pas.
        elif data[headers][i] not in ['1', '2', '3']:
        
            # Méthode 1: 0 points
            #data[headers][i] = '2'
            
            # Méthode 2: Nombre de points aléatoires
            data[headers][i] = val_rand[randint(0,2)]


print(data.head(10))

  Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10
0  1  1  1  1  1  1  1  1  1   1
1  2  2  2  2  2  2  2  2  2   2
2  3  3  3  3  3  3  3  3  3   3
3  1  2  3  1  2  1  2  3  1   2
4  2  3  1  3  1  3  2  3  1   2
5  3  2  1  3  2  2  3  1  3   2
6  1  2  1  1  3  1  3  2  3   1
7  1  1  1  2  1  2  2  3  1   3
8  1  3  3  2  1  2  3  3  1   3
9  1  2  3  2  2  2  3  2  1   3


In [9]:
# On change les types de données.
for headers in list(data):
    data[headers] = data[headers].astype(str).astype(int)

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Q1      235 non-null    int32
 1   Q2      235 non-null    int32
 2   Q3      235 non-null    int32
 3   Q4      235 non-null    int32
 4   Q5      235 non-null    int32
 5   Q6      235 non-null    int32
 6   Q7      235 non-null    int32
 7   Q8      235 non-null    int32
 8   Q9      235 non-null    int32
 9   Q10     235 non-null    int32
dtypes: int32(10)
memory usage: 9.3 KB
None


In [10]:
# On calcule le nouveau score.
score_val = {1: 1, 2: 0, 3: 2}

scores = []
for i in range(0, len(data['Q1'])):
    somme = 0
    for elt in list(data.iloc[i, :10]):
        somme += score_val[elt]
    
    scores.append(somme)
#     scores.append(sum([score_val[elt] for elt in list(data.iloc[i, :10])]))

data['Score'] = scores
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Q1      235 non-null    int32
 1   Q2      235 non-null    int32
 2   Q3      235 non-null    int32
 3   Q4      235 non-null    int32
 4   Q5      235 non-null    int32
 5   Q6      235 non-null    int32
 6   Q7      235 non-null    int32
 7   Q8      235 non-null    int32
 8   Q9      235 non-null    int32
 9   Q10     235 non-null    int32
 10  Score   235 non-null    int64
dtypes: int32(10), int64(1)
memory usage: 11.1 KB
None
   Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  Score
0   1   1   1   1   1   1   1   1   1    1     10
1   2   2   2   2   2   2   2   2   2    2      0
2   3   3   3   3   3   3   3   3   3    3     20
3   1   2   3   1   2   1   2   3   1    2      8
4   2   3   1   3   1   3   2   3   1    2     11
5   3   2   1   3   2   2   3   1   3    2     10
6   1   2   1   1   3   1   3   2  

In [11]:
# score < 10 => C
# score < 20 => B
# score < 30 => A
interpretations = []
for i in data["Score"]:    
    if i < 10:
        interpretation = 'C'
    elif i < 20:
        interpretation = 'B'
    elif i < 30:
        interpretation = 'A'
        
    interpretations.append(interpretation)
data["Interpretation"] = interpretations
data.head(10)
        

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Score,Interpretation
0,1,1,1,1,1,1,1,1,1,1,10,B
1,2,2,2,2,2,2,2,2,2,2,0,C
2,3,3,3,3,3,3,3,3,3,3,20,A
3,1,2,3,1,2,1,2,3,1,2,8,C
4,2,3,1,3,1,3,2,3,1,2,11,B
5,3,2,1,3,2,2,3,1,3,2,10,B
6,1,2,1,1,3,1,3,2,3,1,11,B
7,1,1,1,2,1,2,2,3,1,3,9,C
8,1,3,3,2,1,2,3,3,1,3,13,B
9,1,2,3,2,2,2,3,2,1,3,8,C


In [12]:
# On ajoute la nouvelle interprétation.

# Méthode 1.
# interpretation = []
# for i in range(0, len(data["Score"])):
#    if data["Score"][i] < 10:
#        interpretation.append('C')
#    elif data["Score"][i] < 20:
#        interpretation.append('B')
#    elif data["Score"][i] < 30:
#        interpretation.append('A')
# data['Interpretation'] = interpretation

# Méthode 2.
interpretation = []
for i in data["Score"]:
#     print(ord('C'), '-' ,i // 10, '=', ord('C') - i // 10, 'ou', chr(ord('C') - i // 10))
    interpretation.append(chr(ord('C') - i // 10))
data['Interpretation'] = interpretation


# Méthode 3.
# data['Interpretation'] = [chr(ord('C') - (s // 10)) for s in data["Score"]]


print(data.head(10))

   Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  Score Interpretation
0   1   1   1   1   1   1   1   1   1    1     10              B
1   2   2   2   2   2   2   2   2   2    2      0              C
2   3   3   3   3   3   3   3   3   3    3     20              A
3   1   2   3   1   2   1   2   3   1    2      8              C
4   2   3   1   3   1   3   2   3   1    2     11              B
5   3   2   1   3   2   2   3   1   3    2     10              B
6   1   2   1   1   3   1   3   2   3    1     11              B
7   1   1   1   2   1   2   2   3   1    3      9              C
8   1   3   3   2   1   2   3   3   1    3     13              B
9   1   2   3   2   2   2   3   2   1    3      8              C


In [13]:
# Séparation du jeu de train et de test.
train_data = data.sample(int(len(data["Q1"]) * 0.7))
test_data = data.sample(int(len(data["Q1"]) * 0.3))

print(train_data)
print(test_data)

     Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  Score Interpretation
203   2   2   2   2   1   1   2   3   3    2      6              C
87    3   2   3   3   3   3   1   3   3    3     17              B
35    1   3   3   1   1   2   3   1   3    3     14              B
91    1   1   2   3   2   3   1   1   2    3     10              B
149   2   2   2   2   2   2   2   2   2    2      0              C
..   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...    ...            ...
92    1   2   2   3   2   1   1   2   2    1      6              C
197   3   3   3   1   1   1   1   2   1    3     13              B
147   1   1   2   3   2   1   2   1   1    1      8              C
33    3   2   1   2   1   2   3   2   1    3      9              C
42    2   2   3   3   2   1   3   1   3    1     11              B

[164 rows x 12 columns]
     Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  Score Interpretation
15    2   2   1   1   2   2   1   1   2    3      6              C
65    2   3   1   3   1   2   2   3  

# 2 Développement et entraînement d'un modèle KNN from Scratch

In [14]:
from math import sqrt
from scipy.spatial import distance as distance_scipy

# voir: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html

def distance(tab1, tab2, metrics='Euclidean'):
    return distance_scipy.cdist(tab1, tab2, metrics)


# 3 KNN Sklearn

In [16]:
X = data.drop(['Interpretation', 'Score'], axis=1)
y = data['Interpretation']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [19]:
y_pred = classifier.predict(X_test)
y_pred

array(['B', 'C', 'C', 'B', 'C', 'B', 'B', 'C', 'C', 'B', 'C', 'C', 'C',
       'C', 'C', 'C', 'B', 'C', 'C', 'C', 'B', 'B', 'C', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'B', 'C', 'B', 'C', 'C', 'C', 'C', 'C', 'C',
       'B', 'C', 'C', 'C', 'C', 'C', 'B', 'C'], dtype=object)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print('Précision du modèle : ', accuracy_score(y_test, y_pred))
print('\nMatrice de confusion : \n', confusion_matrix(y_test, y_pred))
print('\nReport : \n', classification_report(y_test, y_pred))

Précision du modèle :  0.5319148936170213

Matrice de confusion : 
 [[ 7 17]
 [ 5 18]]

Report : 
               precision    recall  f1-score   support

           B       0.58      0.29      0.39        24
           C       0.51      0.78      0.62        23

    accuracy                           0.53        47
   macro avg       0.55      0.54      0.50        47
weighted avg       0.55      0.53      0.50        47

