### Chargement Packages

In [1]:
# Chargement package
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

### Fonctions

In [60]:
def recodage(dataset):
  """Recode the raw dataset before missing-value handling.

  Steps, in order:
    1. drop the identifier / bookkeeping columns that are not used afterwards;
    2. convert the comma-formatted numeric columns to floats;
    3. cast the categorical columns to ``object``;
    4. reorder the columns: ``object`` columns first, the others next,
       and the target column ``match`` last.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Raw frame containing every column named in the lists below.

  Returns
  -------
  pandas.DataFrame
      A new, recoded frame. The input frame is left untouched because the
      initial ``drop`` returns a new object.

  Raises
  ------
  KeyError
      If one of the expected columns is missing from ``dataset``.
  """
  # Columns removed from the dataset
  cols_to_drop = ['partner','dec_o', 'order', 'positin1', 'position', 'round', 'wave','field', 'career', 'expnum', 'from', 'zipcode','idg','iid','id','pid']
  dataset = dataset.drop(columns=cols_to_drop)

  # Columns to convert to numeric
  var_tofloat = ['int_corr','income','pf_o_att',	'pf_o_sin',	'pf_o_int',	'pf_o_fun',	'pf_o_amb',	'pf_o_sha', 'attr1_1', 'sinc1_1',	'intel1_1',	'fun1_1',	'amb1_1',	'shar1_1']

  for col in var_tofloat:
    values = dataset[col]
    # The .str accessor only exists on string-like columns: a column that
    # was already parsed as numeric needs no comma clean-up.
    if not pd.api.types.is_numeric_dtype(values):
      values = values.str.replace(",", "", regex=False)
    dataset[col] = pd.to_numeric(values, downcast="float")

  # Columns to cast to object (treated as categorical)
  var_toobject = ['iid_pid','gender', 'condtn','match','samerace','race_o','field_cd', 'race', 'goal','date', 'go_out', 'career_c']

  for col in var_toobject:
    dataset[col] = dataset[col].astype("object")

  # Object columns first, every other dtype afterwards
  colnames_categ = dataset.select_dtypes(include='object').columns.tolist()
  colnames_other = dataset.select_dtypes(exclude='object').columns.tolist()
  dataset = dataset.reindex(columns=colnames_categ + colnames_other)

  # Target column goes last
  df_res = dataset[[c for c in dataset if c != 'match'] + ['match']]

  return df_res

In [None]:
def recodage_withdec_o(dataset):
  """Recode the raw dataset while keeping ``dec_o`` and the identifier columns.

  Only ``field``, ``career``, ``order``, ``positin1``, ``position`` and
  ``round`` are removed. The comma-formatted numeric columns are converted
  to floats and the categorical columns are cast to ``object``. Column
  order is left as in the input.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Raw frame containing every column named in the lists below.

  Returns
  -------
  pandas.DataFrame
      A new, recoded frame. The caller's frame is not modified.

  Raises
  ------
  KeyError
      If one of the expected columns is missing from ``dataset``.
  """
  # Dropping first returns a new frame, so the assignments below never
  # touch the caller's DataFrame and the function can be re-run safely.
  df_res = dataset.drop(columns=['field', 'career', 'order', 'positin1', 'position', 'round'])

  # Columns to convert to numeric
  var_tofloat = ['int_corr','income','pf_o_att',	'pf_o_sin',	'pf_o_int',	'pf_o_fun',	'pf_o_amb',	'pf_o_sha', 'attr1_1', 'sinc1_1',	'intel1_1',	'fun1_1',	'amb1_1',	'shar1_1']

  for col in var_tofloat:
    values = df_res[col]
    # The .str accessor only exists on string-like columns: a column that
    # was already parsed as numeric needs no comma clean-up.
    if not pd.api.types.is_numeric_dtype(values):
      values = values.str.replace(",", "", regex=False)
    df_res[col] = pd.to_numeric(values, downcast="float")

  # Columns to cast to object (treated as categorical); 'position' and
  # 'positin1' are not listed because they were dropped above.
  var_toobject = ['gender','idg', 'condtn',	'wave', 'match','samerace','race_o','dec_o','field_cd', 'race', 'goal','date', 'go_out', 'career_c']

  for col in var_toobject:
    df_res[col] = df_res[col].astype("object")

  return df_res

In [75]:
def remove_na(dataset):
  """Handle missing values separately for numeric and categorical columns.

  * Non-``object`` columns are imputed with a ``KNNImputer``
    (3 neighbours, distance-weighted).
  * Among the ``object`` columns, those at positions 1 to 10 have their
    missing values replaced by the code ``99``. The selection is
    positional, so it depends on the column order of ``dataset``.

  The two parts are put back together (``object`` columns first) and the
  target column ``match`` is moved to the last position. The resulting
  shape and the remaining NaN count are printed.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Recoded frame; must contain a ``match`` column.

  Returns
  -------
  pandas.DataFrame
      The cleaned frame, with the same index as ``dataset``.
  """
  # Quantitative columns: KNN imputation. The original index is carried
  # over so that rows still line up with the qualitative part, even when
  # the input index is not 0..n-1.
  df_quanti = dataset.select_dtypes(exclude='object')
  imputer = KNNImputer(n_neighbors=3, weights="distance")
  df_quanti = pd.DataFrame(
      imputer.fit_transform(df_quanti),
      columns=df_quanti.columns,
      index=df_quanti.index,
  )

  # Qualitative columns: fill NaN with the code 99. The result is assigned
  # back explicitly; calling fillna(inplace=True) on an iloc slice acts on
  # a temporary object and is not guaranteed to update df_quali.
  df_quali = dataset.select_dtypes(include='object').copy()
  cols_to_fill = df_quali.columns[1:11]
  if len(cols_to_fill) > 0:
    # astype("object") keeps these columns in the categorical (object) group
    df_quali[cols_to_fill] = df_quali[cols_to_fill].fillna(99).astype("object")

  # Combine both parts, then move the target column to the end
  df_clean = pd.concat([df_quali, df_quanti], axis=1)
  df_clean = df_clean[[c for c in df_clean if c != 'match'] + ['match']]

  print("Format : " )
  print(df_clean.shape)
  print("Nombre de NaN : ")
  print(df_clean.isna().sum().sum())
  return df_clean

In [45]:
def clean_data(dataset):
  """Run the standard cleaning pipeline on a raw dataset.

  The frame is first recoded with ``recodage``; the recoded frame is then
  passed to ``remove_na`` and its result is returned.
  """
  return remove_na(recodage(dataset))

In [None]:
def clean_data_withdec_o(dataset):
  """Run the cleaning pipeline variant built on ``recodage_withdec_o``.

  The frame is first recoded with ``recodage_withdec_o``; the recoded
  frame is then passed to ``remove_na`` and its result is returned.
  """
  return remove_na(recodage_withdec_o(dataset))

### Chargement Dataset

In [3]:
# Connect to Google Drive: mount it at /content/drive in the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the training file from the mounted Google Drive (';'-separated CSV)
file_path = "/content/drive/MyDrive/M2_SISE/Python_M2/train.csv" # Personal Google Drive path
df_train = pd.read_csv(file_path, sep = ';')

### Application des fonctions

In [81]:
# Work on a copy so that df_train keeps the data exactly as loaded
df = df_train.copy()
# Recode + handle missing values (pipeline that drops dec_o and the id columns)
df_nett = clean_data(df)

Format : 
(6804, 54)
Nombre de NaN : 
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [None]:
# Fresh copy of the loaded data for the second pipeline
df = df_train.copy()
# Recode + handle missing values (pipeline variant based on recodage_withdec_o)
df_nett_deco = clean_data_withdec_o(df)

Format : 
(6804, 64)
Nombre de NaN : 
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [84]:
# Display the cleaned dataset
df_nett

Unnamed: 0,iid_pid,gender,condtn,samerace,race_o,field_cd,race,goal,date,go_out,...,shopping,yoga,exphappy,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,match
0,6804,1,2,0,2.0,1.0,4.0,2.0,6.0,5.0,...,4.0,1.0,6.0,30.0,30.0,30.0,5.0,0.0,5.0,0
1,6803,1,2,0,2.0,1.0,4.0,2.0,6.0,5.0,...,4.0,1.0,6.0,30.0,30.0,30.0,5.0,0.0,5.0,0
2,6802,1,2,0,2.0,1.0,4.0,2.0,6.0,5.0,...,4.0,1.0,6.0,30.0,30.0,30.0,5.0,0.0,5.0,0
3,6801,1,2,0,2.0,1.0,4.0,2.0,6.0,5.0,...,4.0,1.0,6.0,30.0,30.0,30.0,5.0,0.0,5.0,0
4,6800,1,2,0,1.0,1.0,4.0,2.0,6.0,5.0,...,4.0,1.0,6.0,30.0,30.0,30.0,5.0,0.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6799,5,0,1,0,3.0,1.0,4.0,2.0,7.0,1.0,...,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,1
6800,4,0,1,0,2.0,1.0,4.0,2.0,7.0,1.0,...,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,1
6801,3,0,1,1,4.0,1.0,4.0,2.0,7.0,1.0,...,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,1
6802,2,0,1,0,2.0,1.0,4.0,2.0,7.0,1.0,...,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,0


### Export

In [85]:
# Write the cleaned dataset to a CSV file, then trigger a browser download from Colab
from google.colab import files
df_nett.to_csv('train_clean.csv') 
files.download('train_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the second cleaned dataset to a CSV file, then trigger a browser download from Colab
from google.colab import files
df_nett_deco.to_csv('df_clean_deco.csv') 
files.download('df_clean_deco.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>