### Importing Libraries 

In [23]:
#Declare all libraries and their uses here

import pandas as pd # dataframe analysis, creation of dataframe, label encoding
# import matplotlib.pyplot as plt # data visualisation
# import seaborn as sns # data visualisation

# import numpy as np 
# import statsmodels.api as sm

from sklearn import preprocessing

### Loading Dataset

In [41]:
# Location of the training data, relative to the notebook's working directory
TRAIN_CSV_PATH = 'train.csv'

# Read the raw training set into the working dataframe
df = pd.read_csv(TRAIN_CSV_PATH)

### Studying the data

Viewing data types and non-null counts of each column of the data

In [42]:
# Print a summary of the dataframe: one row per column with its
# non-null count and dtype, plus the index range.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28322 entries, 0 to 28321
Data columns (total 36 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      28322 non-null  object 
 1   gender  28322 non-null  object 
 2   s11     28322 non-null  object 
 3   s12     28322 non-null  object 
 4   s13     28322 non-null  int64  
 5   s16     28322 non-null  object 
 6   s17     28322 non-null  object 
 7   s18     28322 non-null  object 
 8   s48     28322 non-null  int64  
 9   s52     28322 non-null  object 
 10  s53     28322 non-null  object 
 11  s54     2628 non-null   object 
 12  s55     3206 non-null   object 
 13  s56     0 non-null      float64
 14  s57     0 non-null      float64
 15  s58     28322 non-null  object 
 16  s59     0 non-null      float64
 17  s69     28322 non-null  object 
 18  s70     28322 non-null  object 
 19  s71     28322 non-null  object 
 20  n1      28322 non-null  float64
 21  n2      28322 non-null  float64
 22

Viewing all the counts of each unique value in each column

In [26]:
# Every column of the dataframe, in order
column_names = df.columns

# Print the frequency table of each column, followed by a separator line
for column in column_names:
    counts = df[column].value_counts()
    print(counts, "\n************\n", sep="\n")

b'gAAAAABinOicS09vrmgh0_JyEHihI13ptO0rCyHP7l76be71PWA2ReUc4HUQn16Fya1z8_VStNnFGaXJF262CgsuMPzOaknSeg=='    1
b'gAAAAABinOiekMHaC6-03yzvmLFVBfqljUIev5XFrbbJarEbo-mshNj5pWAXfTdmvEMQtb0WjtlboHt2rFBrCoipjAn9sOOZEg=='    1
b'gAAAAABinOiYtx-HpOZBYTtT-FndzIj6Nt2HTKu7UKAs-Dfxd8mxrvwxRNGE47Si_-kc5TiwVkTf59u94658aAU7gHD9-TGBMQ=='    1
b'gAAAAABinOia961WRLsDIRdEcGr5_RSKfmTjQ2ME5HBpIUtBdjJUeCTgVI6uzDIdnDRB58VBUvirHgdTdjgECltfpZ2XequVmA=='    1
b'gAAAAABinOibb7LcG8T4rQSnzf-b2GjK5D0F1ZLe6VMES-x90Pi5-Emk_dbp7xYXuepQpmVa_rhfEsemOUZUGn9_30mt-Vr8iQ=='    1
                                                                                                          ..
b'gAAAAABinOiawRN0qVxqLZn-_t1Qb87hktzzZnsBU00aUINCq1S48jnwhwxdn5XNd6mR189SFIILrWFbz0_LyBAvIlgUCImvqw=='    1
b'gAAAAABinOiZPy87pVtsRScbROxYGDhSan3Pp9q53S8zvzjZWJIhBd_PGqGRWUhxCwbTpmkda93liZMesGSchyhojqcCkJUD2w=='    1
b'gAAAAABinOib5XtfV1CgG7sJ3lqWVyVEX63543B09-SgMzYFlBkQuLmoEYkpfdNT1_v2JR0feyWOaSlNB_HLd2WSNncMF_Pu8Q=='    1
b'gAAAAABinOiZOg76i

Identify any columns having null values

In [27]:
# Count missing values per column; to_frame() yields a single column labelled 0
null_counts = df.isna().sum()
null_df = null_counts.to_frame()

# Show only the columns that have at least one missing value
null_df.loc[null_df[0].ne(0)]

Unnamed: 0,0
s54,25694
s55,25116
s56,28322
s57,28322
s59,28322


### Identify and drop columns with mostly null values
Columns with more than 70% missing values are dropped, as they carry too little information to have a significant effect on the target column.

In [28]:
# Share of missing values above which a column is treated as uninformative
NULL_FRACTION_THRESHOLD = 0.7

# Derive the columns to drop from the data rather than hard-coding them, so the
# list always agrees with the stated ">70% missing" rule.
null_fraction = df.isnull().mean()
majority_null = null_fraction[null_fraction > NULL_FRACTION_THRESHOLD].index.tolist()

# Rebind instead of dropping in place: re-running this cell is then a no-op,
# whereas an in-place drop of already-removed columns raises KeyError.
df = df.drop(columns=majority_null)

### Encoding

Label encoding is used for binary categorical data (columns with only two options, e.g. M/F, Y/N). These columns were identified from the column value counts. Done using the scikit-learn library.

In [29]:
# Single scikit-learn LabelEncoder instance, re-fitted for every column below
label_encoder = preprocessing.LabelEncoder()

# Two-valued categorical columns to convert to integer codes
label_encode = ['gender', 's11', 's12', 's53', 's58']

# Fit on each column in turn and overwrite it with its encoded values
for binary_column in label_encode:
    encoded_values = label_encoder.fit_transform(df[binary_column])
    df[binary_column] = encoded_values

Use one-hot encoding instead of label encoding for non-binary categories: it is unclear whether the data is ordinal, so this avoids unintentional ranking and therefore bias. Done using `get_dummies` from the pandas library.

In [30]:
# Multi-valued categorical columns to expand into indicator columns
one_hot_encode = ['s16', 's17', 's18', 's69', 's70', 's71']

# One dummy column per distinct value of each selected column;
# pandas prefixes every dummy with the name of its source column
ohe_df = pd.get_dummies(df[one_hot_encode])

# Swap the original categorical columns for their encoded counterparts:
# remove the originals, then attach the dummies on the shared index
df = df.drop(columns=one_hot_encode).join(ohe_df)

In [1]:
# Map the letters found in s52 to the digits they are replaced with.
# NOTE(review): assumes 'l' and 'o' only occur as stand-ins for 1 and 0 —
# confirm against the raw value counts of s52.
replacement_mapping_dict = {
    "l": "1",
    "o": "0"
}

# Series.replace accepts a mapping together with regex=True and applies every
# pattern in one vectorised call. The previous per-element
# `x.replace(dict, regex=True)` invoked str.replace, which takes neither a
# dict nor a `regex` keyword and raises TypeError.
# This cell needs `df` from the loading cell, so it must run after it.
df['s52'] = df['s52'].replace(replacement_mapping_dict, regex=True)

NameError: name 'df' is not defined

In [45]:
# A mapping dictionary lets a single call to replace() perform several
# substitutions at once, so adding more replacements only means adding keys.

# Frequency of each distinct value in s52
s52_counts = df['s52'].value_counts()
s52_counts

1    27032
0     1290
Name: s52, dtype: int64

### Look at the encoded categorical data

In [None]:
# Columns of the dataframe at this point in the notebook
column_names = df.columns

# Marker line printed after every frequency table
separator = "\n************\n"

# Print the value frequencies of each column in turn
for column in column_names:
    print(df[column].value_counts())
    print(separator)