# **IMPORTS**

In [397]:
import pickle
import matplotlib.pyplot as plt
import math
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, silhouette_score, davies_bouldin_score, calinski_harabasz_score

# **I - LOAD DATASETS**

In [398]:
columns = ["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slope","ca","thal","num"]

## Cleveland

In [399]:
df_cleveland = pd.read_csv('processed.cleveland.data')
df_cleveland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [400]:
df_cleveland.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


## Switzerland

In [401]:
df_switzerland = pd.read_csv('processed.switzerland.data')
df_switzerland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


In [402]:
df_switzerland.describe()

Unnamed: 0,age,sex,cp,chol,num
count,123.0,123.0,123.0,123.0,123.0
mean,55.317073,0.918699,3.699187,0.0,1.804878
std,9.032108,0.274414,0.688726,0.0,1.013503
min,32.0,0.0,1.0,0.0,0.0
25%,51.0,1.0,4.0,0.0,1.0
50%,56.0,1.0,4.0,0.0,2.0
75%,61.5,1.0,4.0,0.0,3.0
max,74.0,1.0,4.0,0.0,4.0


## Long beach va

In [403]:
df_va = pd.read_csv('processed.va.data')
df_va.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,4,140,260,0,1,112,1,3.0,2,?,?,2
1,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0
2,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2
3,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1
4,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0


In [404]:
df_va.describe()

Unnamed: 0,age,sex,cp,restecg,num
count,200.0,200.0,200.0,200.0,200.0
mean,59.35,0.97,3.505,0.735,1.52
std,7.811697,0.171015,0.795701,0.683455,1.219441
min,35.0,0.0,1.0,0.0,0.0
25%,55.0,1.0,3.0,0.0,0.0
50%,60.0,1.0,4.0,1.0,1.0
75%,64.0,1.0,4.0,1.0,3.0
max,77.0,1.0,4.0,2.0,4.0


## **Hungarian**

In [405]:
df_hungarian = pd.read_csv('reprocessed.hungarian.data', sep=" ")
df_hungarian.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


In [406]:
df_hungarian.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0
mean,47.826531,0.72449,2.982993,132.102041,230.520408,-0.176871,0.187075,138.62585,0.272109,0.586054,-5.146259,-8.846939,-7.605442,0.792517
std,7.811812,0.447533,0.965117,19.437564,95.414336,1.499491,0.707616,25.08408,0.711273,0.908648,5.221611,1.382623,4.333468,1.237006
min,28.0,0.0,1.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,0.0,-9.0,-9.0,-9.0,0.0
25%,42.0,0.0,2.0,120.0,198.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
50%,49.0,1.0,3.0,130.0,237.0,0.0,0.0,140.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
75%,54.0,1.0,4.0,140.0,277.0,0.0,0.0,155.0,1.0,1.0,2.0,-9.0,-9.0,1.0
max,66.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,3.0,9.0,7.0,4.0


## **Dataset final**

On réunit les quatre datasets pour en faire un grand : 

In [407]:
df = pd.concat([df_cleveland, df_hungarian, df_switzerland, df_va], axis=0, ignore_index=True)

In [408]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
age         920 non-null float64
sex         920 non-null float64
cp          920 non-null float64
trestbps    920 non-null object
chol        920 non-null object
fbs         920 non-null object
restecg     920 non-null object
thalach     920 non-null object
exang       920 non-null object
oldpeak     920 non-null object
slope       920 non-null object
ca          920 non-null object
thal        920 non-null object
num         920 non-null float64
dtypes: float64(4), object(10)
memory usage: 100.9+ KB


On a désormais un dataFrame pandas de 921 lignes. pandas détecte aucune valeurs manquantes car celles-ci sont symbolisées par des "?"

# **II - TRAITEMENT DES DONNÉES**

On commence donc par remplacer les "?" par les valeurs nan de numpy : 

In [409]:
for column in columns :
    for k in range(len(df[column])) :
        if(df[column].iloc[k]=="?"):
            df[column].iloc[k] = np.nan

De plus pandas n'interprète pas toutes les valeurs comme float, on règle ça : 

In [410]:
df['trestbps'] = df['trestbps'].astype('float64')
df['chol'] = df['chol'].astype('float64')
df['fbs'] = df['fbs'].astype('float64')
df['restecg'] = df['restecg'].astype('float64')
df['thalach'] = df['thalach'].astype('float64')
df['exang'] = df['exang'].astype('float64')
df['oldpeak'] = df['slope'].astype('float64')
df['ca'] = df['ca'].astype('float64')
df['thal'] = df['thal'].astype('float64')
df['slope'] = df['slope'].astype('float64')

In [411]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
age         920 non-null float64
sex         920 non-null float64
cp          920 non-null float64
trestbps    862 non-null float64
chol        913 non-null float64
fbs         838 non-null float64
restecg     919 non-null float64
thalach     866 non-null float64
exang       866 non-null float64
oldpeak     801 non-null float64
slope       801 non-null float64
ca          600 non-null float64
thal        700 non-null float64
num         920 non-null float64
dtypes: float64(14)
memory usage: 100.9 KB


## **a) Valeurs aberrantes**

* **thal**

In [412]:
indexs_thal = df[df['thal']<0].index 

In [413]:
for index in indexs_thal : 
    df['thal'].loc[index] = np.nan

* **ca**

In [414]:
indexs_ca = df[df['ca']<0].index

In [415]:
for index in indexs_ca : 
    df['ca'].loc[index] = np.nan

* **trestbps**

In [416]:
indexs_trestbps = df[df['trestbps']<0].index

In [417]:
for index in indexs_trestbps : 
    df['trestbps'].loc[index] = np.nan

* **chol**

In [418]:
indexs_chol = df[df['chol']<0].index

In [419]:
for index in indexs_chol : 
    df['chol'].loc[index] = np.nan

* **fbs**

In [420]:
indexs_fbs = df[(df['fbs'] != 1) & (df['fbs'] != 0)].index

In [421]:
for index in indexs_fbs : 
    df['fbs'].loc[index] = np.nan

* **thalach**

In [422]:
indexs_thalach = df[df['thalach']<0].index

In [423]:
for index in indexs_thalach : 
    df['thalach'].loc[index] = np.nan

* **exang**

In [424]:
indexs_exang = df[df['exang']<0].index
for index in indexs_exang : 
    df['exang'].loc[index] = np.nan

* **oldpeak**

In [425]:
indexs_oldpeak = df[df['oldpeak']==-9].index
for index in indexs_oldpeak : 
    df['oldpeak'].loc[index] = np.nan

* **slope**

In [426]:
indexs_slope = df[df['slope']==-9].index
for index in indexs_slope : 
    df['slope'].loc[index] = np.nan

In [427]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Data columns (total 14 columns):
age         920 non-null float64
sex         920 non-null float64
cp          920 non-null float64
trestbps    861 non-null float64
chol        890 non-null float64
fbs         830 non-null float64
restecg     919 non-null float64
thalach     865 non-null float64
exang       865 non-null float64
oldpeak     611 non-null float64
slope       611 non-null float64
ca          310 non-null float64
thal        434 non-null float64
num         920 non-null float64
dtypes: float64(14)
memory usage: 100.9 KB


In [428]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,920.0,920.0,920.0,861.0,890.0,830.0,919.0,865.0,865.0,611.0,611.0,310.0,434.0,920.0
mean,53.51087,0.78913,3.25,132.132404,199.130337,0.166265,0.594124,137.545665,0.389595,1.770867,1.770867,0.703226,5.087558,1.133696
std,9.424685,0.408148,0.930969,19.06607,110.78081,0.372543,0.865464,25.926276,0.487941,0.619256,0.619256,1.046951,1.919075,1.258942
min,28.0,0.0,1.0,0.0,0.0,0.0,-9.0,60.0,0.0,1.0,1.0,0.0,3.0,0.0
25%,47.0,1.0,3.0,120.0,175.0,0.0,0.0,120.0,0.0,1.0,1.0,0.0,3.0,0.0
50%,54.0,1.0,4.0,130.0,223.0,0.0,0.0,140.0,0.0,2.0,2.0,0.0,6.0,1.0
75%,60.0,1.0,4.0,140.0,268.0,0.0,1.0,157.0,1.0,2.0,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,3.0,3.0,9.0,7.0,4.0


En se renseignant on a vu que beaucoup des caractéristiques sont étroitement liées avec l'âge de manière médicale. On ajoute donc une colonne 'fork_age' attribuant une fourchette d'age de 10 ans aux patients : 

In [429]:
fourchette_age = list()
for k in range(0,len(df)) : 
    if((df['age'].loc[k] > 20) & (df['age'].loc[k]<=30)) : 
        fourchette_age.append(1)
    if((df['age'].loc[k] > 30) & (df['age'].loc[k]<=40)) : 
        fourchette_age.append(2)
    if((df['age'].loc[k] > 40) & (df['age'].loc[k]<=50)) : 
        fourchette_age.append(3)
    if((df['age'].loc[k] > 50) & (df['age'].loc[k]<=60)) : 
        fourchette_age.append(4)
    if((df['age'].loc[k] > 60) & (df['age'].loc[k]<=70)) : 
        fourchette_age.append(5)
    if((df['age'].loc[k] > 70) & (df['age'].loc[k]<=80)) : 
        fourchette_age.append(6)

In [430]:
df['fork_age'] = pd.Series(fourchette_age)

In [431]:
m1, m2, m3, m4, m5, m6 = df['fork_age'] == 1, df['fork_age'] == 2, df['fork_age'] == 3, df['fork_age'] == 4, df['fork_age'] == 5, df['fork_age'] == 6
n1, n2, n3, n4, n5, n6 = (df['fork_age'] == 1) & (df['sex'] == 1) , (df['fork_age'] == 2) & (df['sex'] == 1), (df['fork_age'] == 3) & (df['sex'] == 1), (df['fork_age'] == 4) & (df['sex'] == 1), (df['fork_age'] == 5) & (df['sex'] == 1), (df['fork_age'] == 6) & (df['sex'] == 1)
f1, f2, f3, f4, f5, f6 = (df['fork_age'] == 1) & (df['sex'] == 0) , (df['fork_age'] == 2) & (df['sex'] == 0), (df['fork_age'] == 3) & (df['sex'] == 0), (df['fork_age'] == 4) & (df['sex'] == 0), (df['fork_age'] == 5) & (df['sex'] == 0), (df['fork_age'] == 6) & (df['sex'] == 0)

On observe que les colonnes ayant le plus de valeurs manquantes sont "ca"(320) et "thal"(220). Commençons donc par ces deux colonnes : 

* **âge**

In [432]:
df = df.drop(df[df['age'].isna()].index)

* **ca : trop de valeurs manquantes, on a don décidé donc dans un premier temps de supprimer la colonne :** 

In [433]:
df = df.drop(columns = 'ca')

* **thal : c'est pas optimal mais on remplace par la médiane (on ne peut pas prédire car thal est une caractéristique héréditaire) par fourchette d'âge :**

In [434]:
df.loc[m1,'thal'] = df.loc[m1,'thal'].fillna(df.loc[m1,'thal'].median())
df.loc[m2,'thal'] = df.loc[m2,'thal'].fillna(df.loc[m2,'thal'].median())
df.loc[m3,'thal'] = df.loc[m3,'thal'].fillna(df.loc[m3,'thal'].median())
df.loc[m4,'thal'] = df.loc[m4,'thal'].fillna(df.loc[m4,'thal'].median())
df.loc[m5,'thal'] = df.loc[m5,'thal'].fillna(df.loc[m5,'thal'].median())
df.loc[m6,'thal'] = df.loc[m6,'thal'].fillna(df.loc[m6,'thal'].median())

* **trestbps : on remplace par la moyenne par fourchette d'âge :** 

In [435]:
df.loc[m1,'trestbps'] = df.loc[m1,'trestbps'].fillna(df.loc[m1,'trestbps'].mean())
df.loc[m2,'trestbps'] = df.loc[m2,'trestbps'].fillna(df.loc[m2,'trestbps'].mean())
df.loc[m3,'trestbps'] = df.loc[m3,'trestbps'].fillna(df.loc[m3,'trestbps'].mean())
df.loc[m4,'trestbps'] = df.loc[m4,'trestbps'].fillna(df.loc[m4,'trestbps'].mean())
df.loc[m5,'trestbps'] = df.loc[m5,'trestbps'].fillna(df.loc[m5,'trestbps'].mean())
df.loc[m6,'trestbps'] = df.loc[m6,'trestbps'].fillna(df.loc[m6,'trestbps'].mean())

* **chol**

In [436]:
df.loc[n1,'chol'] = df.loc[n1,'chol'].fillna(df.loc[n1,'chol'].mean())
df.loc[n2,'chol'] = df.loc[n2,'chol'].fillna(df.loc[n2,'chol'].mean())
df.loc[n3,'chol'] = df.loc[n3,'chol'].fillna(df.loc[n3,'chol'].mean())
df.loc[n4,'chol'] = df.loc[n4,'chol'].fillna(df.loc[n4,'chol'].mean())
df.loc[n5,'chol'] = df.loc[n5,'chol'].fillna(df.loc[n5,'chol'].mean())
df.loc[n6,'chol'] = df.loc[n6,'chol'].fillna(df.loc[n6,'chol'].mean())

In [437]:
df.loc[f1,'chol'] = df.loc[f1,'chol'].fillna(df.loc[f1,'chol'].mean())
df.loc[f2,'chol'] = df.loc[f2,'chol'].fillna(df.loc[f2,'chol'].mean())
df.loc[f3,'chol'] = df.loc[f3,'chol'].fillna(df.loc[f3,'chol'].mean())
df.loc[f4,'chol'] = df.loc[f4,'chol'].fillna(df.loc[f4,'chol'].mean())
df.loc[f5,'chol'] = df.loc[f5,'chol'].fillna(df.loc[f5,'chol'].mean())
df.loc[f6,'chol'] = df.loc[f6,'chol'].fillna(df.loc[f6,'chol'].mean())

* **fbs**

In [438]:
df.loc[m1,'fbs'] = df.loc[m1,'fbs'].fillna(df.loc[m1,'fbs'].median())
df.loc[m2,'fbs'] = df.loc[m2,'fbs'].fillna(df.loc[m2,'fbs'].median())
df.loc[m3,'fbs'] = df.loc[m3,'fbs'].fillna(df.loc[m3,'fbs'].median())
df.loc[m4,'fbs'] = df.loc[m4,'fbs'].fillna(df.loc[m4,'fbs'].median())
df.loc[m5,'fbs'] = df.loc[m5,'fbs'].fillna(df.loc[m5,'fbs'].median())
df.loc[m6,'fbs'] = df.loc[m6,'fbs'].fillna(df.loc[m6,'fbs'].median())

* **restecg**

In [439]:
df = df.drop(df[df['restecg'].isna()].index)

* **thalach**

In [440]:
indexs = df[df['thalach'].isna()].index
for index in indexs : 
    if (df['sex'].loc[index] == 1) :
        df["thalach"].loc[index] = 220 - df['age'].loc[index]
    else : 
        df['thalach'].loc[index] == 226 - df['age'].loc[index] 

In [441]:
df['thalach'].loc[772] = 226 - df['age'].loc[772]
df['thalach'].loc[592] = 226 - df['age'].loc[592]

* **exang**

In [442]:
df.loc[m1,'exang'] = df.loc[m1,'exang'].fillna(df.loc[m1,'exang'].median())
df.loc[m2,'exang'] = df.loc[m2,'exang'].fillna(df.loc[m2,'exang'].median())
df.loc[m3,'exang'] = df.loc[m3,'exang'].fillna(df.loc[m3,'exang'].median())
df.loc[m4,'exang'] = df.loc[m4,'exang'].fillna(df.loc[m4,'exang'].median())
df.loc[m5,'exang'] = df.loc[m5,'exang'].fillna(df.loc[m5,'exang'].median())
df.loc[m6,'exang'] = df.loc[m6,'exang'].fillna(df.loc[m6,'exang'].median())

* **oldpeak**

In [443]:
df.loc[m1,'oldpeak'] = df.loc[m1,'oldpeak'].fillna(df.loc[m1,'oldpeak'].median())
df.loc[m2,'oldpeak'] = df.loc[m2,'oldpeak'].fillna(df.loc[m2,'oldpeak'].median())
df.loc[m3,'oldpeak'] = df.loc[m3,'oldpeak'].fillna(df.loc[m3,'oldpeak'].median())
df.loc[m4,'oldpeak'] = df.loc[m4,'oldpeak'].fillna(df.loc[m4,'oldpeak'].median())
df.loc[m5,'oldpeak'] = df.loc[m5,'oldpeak'].fillna(df.loc[m5,'oldpeak'].median())
df.loc[m6,'oldpeak'] = df.loc[m6,'oldpeak'].fillna(df.loc[m6,'oldpeak'].median())

* **slope**

In [444]:
df.loc[m1,'slope'] = df.loc[m1,'slope'].fillna(df.loc[m1,'slope'].median())
df.loc[m2,'slope'] = df.loc[m2,'slope'].fillna(df.loc[m2,'slope'].median())
df.loc[m3,'slope'] = df.loc[m3,'slope'].fillna(df.loc[m3,'slope'].median())
df.loc[m4,'slope'] = df.loc[m4,'slope'].fillna(df.loc[m4,'slope'].median())
df.loc[m5,'slope'] = df.loc[m5,'slope'].fillna(df.loc[m5,'slope'].median())
df.loc[m6,'slope'] = df.loc[m6,'slope'].fillna(df.loc[m6,'slope'].median())

* **le reste**

In [445]:
df = df.drop(df[df['fork_age'].isna()].index)

In [446]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 919
Data columns (total 14 columns):
age         918 non-null float64
sex         918 non-null float64
cp          918 non-null float64
trestbps    918 non-null float64
chol        918 non-null float64
fbs         918 non-null float64
restecg     918 non-null float64
thalach     918 non-null float64
exang       918 non-null float64
oldpeak     918 non-null float64
slope       918 non-null float64
thal        918 non-null float64
num         918 non-null float64
fork_age    918 non-null float64
dtypes: float64(14)
memory usage: 107.6 KB


In [447]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,thal,num,fork_age
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.522876,0.788671,3.250545,132.289273,199.355653,0.150327,0.592593,138.969499,0.372549,1.844227,1.844227,5.146514,1.133987,3.819172
std,9.408749,0.408474,0.930739,18.471591,108.965735,0.357586,0.864689,25.789711,0.483747,0.518706,0.518706,1.875935,1.260299,0.977379
min,28.0,0.0,1.0,0.0,0.0,0.0,-9.0,60.0,0.0,1.0,1.0,3.0,0.0,1.0
25%,47.0,1.0,3.0,120.0,177.0,0.0,0.0,120.0,0.0,2.0,2.0,3.0,0.0,3.0
50%,54.0,1.0,4.0,130.0,222.0,0.0,0.0,140.0,0.0,2.0,2.0,6.0,1.0,4.0
75%,60.0,1.0,4.0,140.0,267.0,0.0,1.0,160.0,1.0,2.0,2.0,7.0,2.0,4.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,3.0,3.0,7.0,4.0,6.0


In [448]:
df['num'] = df.num.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})

In [449]:
with open('df_complete.pickle', 'wb') as f:
    pickle.dump(df, f)