### importing the necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
import os
sns.set_style(style="darkgrid")

### Getting path to dataset

In [4]:
current=os.getcwd()
root=os.path.dirname(current)

In [5]:
print(os.listdir(root))

['.git', '.gitattributes', '.virtual_documents', 'data', 'LICENSE', 'notebooks', 'source documents']


### getting full path to dataset

In [7]:
path=os.path.join(root,'data','data_science_competition_2024.csv')

### reading data

In [9]:
df=pd.read_csv(path)

### checking entries in data

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,loan_id,gender,disbursemet_date,currency,country,sex,is_employed,job,location,...,number_of_defaults,outstanding_balance,interest_rate,age,number_of_defaults.1,remaining term,salary,marital_status,age.1,Loan Status
0,0,8d05de78-ff32-46b1-aeb5-b3190f9c158a,female,2022 10 29,USD,Zimbabwe,female,True,Teacher,Beitbridge,...,0,48653.011473,0.22,37,0,47,3230.038869,married,37,Did not default
1,1,368bf756-fcf2-4822-9612-f445d90b485b,other,2020 06 06,USD,Zimbabwe,other,True,Teacher,Harare,...,2,28752.062237,0.2,43,2,62,3194.139103,single,43,Did not default
2,2,6e3be39e-49b5-45b5-aab6-c6556de53c6f,other,2023 09 29,USD,Zimbabwe,other,True,Nurse,Gweru,...,1,44797.554126,0.22,43,1,57,3330.826656,married,43,Did not default
3,3,191c62f8-2211-49fe-ba91-43556b307871,female,2022 06 22,USD,Zimbabwe,female,True,Doctor,Rusape,...,0,35681.496413,0.23,47,0,42,2246.79702,divorced,47,Did not default
4,4,477cd8a1-3b01-4623-9318-8cd6122a8346,male,2023 02 08,USD,Zimbabwe,male,True,Nurse,Chipinge,...,0,34156.055882,0.2,42,0,45,2310.858441,married,42,Did not default


## __Data Cleaning__

### `Unnamed` feature is unnecessary as it only indicates the index of a row in the dataset

### getting rid of the column

In [15]:
df=pd.read_csv(path,index_col=0)

In [16]:
df.head()

Unnamed: 0,loan_id,gender,disbursemet_date,currency,country,sex,is_employed,job,location,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,number_of_defaults.1,remaining term,salary,marital_status,age.1,Loan Status
0,8d05de78-ff32-46b1-aeb5-b3190f9c158a,female,2022 10 29,USD,Zimbabwe,female,True,Teacher,Beitbridge,39000.0,0,48653.011473,0.22,37,0,47,3230.038869,married,37,Did not default
1,368bf756-fcf2-4822-9612-f445d90b485b,other,2020 06 06,USD,Zimbabwe,other,True,Teacher,Harare,27000.0,2,28752.062237,0.2,43,2,62,3194.139103,single,43,Did not default
2,6e3be39e-49b5-45b5-aab6-c6556de53c6f,other,2023 09 29,USD,Zimbabwe,other,True,Nurse,Gweru,35000.0,1,44797.554126,0.22,43,1,57,3330.826656,married,43,Did not default
3,191c62f8-2211-49fe-ba91-43556b307871,female,2022 06 22,USD,Zimbabwe,female,True,Doctor,Rusape,24000.0,0,35681.496413,0.23,47,0,42,2246.79702,divorced,47,Did not default
4,477cd8a1-3b01-4623-9318-8cd6122a8346,male,2023 02 08,USD,Zimbabwe,male,True,Nurse,Chipinge,19000.0,0,34156.055882,0.2,42,0,45,2310.858441,married,42,Did not default


### checking for number of rows and columns

In [18]:
rows,columns=df.shape

In [19]:
print("rows:{}\ncolumns:{}".format(rows,columns))

rows:100000
columns:20


### checking for missing values

In [21]:
df.isnull().sum()

loan_id                    0
gender                     0
disbursemet_date           0
currency                   0
country                  100
sex                        0
is_employed                0
job                     4136
location                 595
loan_amount                0
number_of_defaults         0
outstanding_balance        0
interest_rate              0
age                        0
number_of_defaults.1       0
remaining term             0
salary                     0
marital_status             0
age.1                      0
Loan Status                0
dtype: int64

### further inspecting null columns

In [23]:
nulls=df.isnull().sum()

In [24]:
nulls=nulls[nulls>0]

In [25]:
nulls

country      100
job         4136
location     595
dtype: int64

### getting feature_names for null columns

In [27]:
null_columns=nulls.index.tolist()
null_columns

['country', 'job', 'location']

In [28]:
subset=df[null_columns]
subset.head()

Unnamed: 0,country,job,location
0,Zimbabwe,Teacher,Beitbridge
1,Zimbabwe,Teacher,Harare
2,Zimbabwe,Nurse,Gweru
3,Zimbabwe,Doctor,Rusape
4,Zimbabwe,Nurse,Chipinge


### cleaning country first

In [30]:
subset.country.value_counts()

country
Zimbabwe    99787
zimbabwe      100
Zim            13
Name: count, dtype: int64

### it seems that the only value under country is Zimbabwe so will fill every missing value with `Zimbabwe`

### But first:

### __making the country codes uniform__

__converting `zimbabwe` to `Zimbabwe`__

In [35]:
subset['country']=subset.country.str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['country']=subset.country.str.title()


__viewing changes__

In [37]:
subset.country.value_counts()

country
Zimbabwe    99887
Zim            13
Name: count, dtype: int64

__now converting `Zim` to `Zimbabwe`__

In [39]:
subset['country']=subset.country.apply(lambda x:'Zimbabwe' if x=='Zim' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['country']=subset.country.apply(lambda x:'Zimbabwe' if x=='Zim' else x)


__viewing changes__

In [41]:
subset.country.value_counts()

country
Zimbabwe    99900
Name: count, dtype: int64

In [42]:
rows

100000

__now that all values for country are uniform,__

__filling missing values with `Zimbabwe`__

In [44]:
subset['country']=subset.country.fillna('Zimbabwe')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['country']=subset.country.fillna('Zimbabwe')


In [45]:
df['country']=subset.country

__getting rid of imputed feature in subset__

In [47]:
subset=subset.drop('country',axis=1)

In [48]:
subset.head()

Unnamed: 0,job,location
0,Teacher,Beitbridge
1,Teacher,Harare
2,Nurse,Gweru
3,Doctor,Rusape
4,Nurse,Chipinge


__inspecting the 2 remaining null features__

In [50]:
subset.describe().transpose()

Unnamed: 0,count,unique,top,freq
job,95864,11,Engineer,16524
location,99405,157,Harare,8338


In [51]:
subset.job.value_counts()

job
Engineer              16524
Nurse                 15284
Data Analyst          13204
Doctor                12186
Software Developer    11932
Teacher                8950
Accountant             7802
SoftwareDeveloper      3564
Data Scientist         3521
Lawyer                 2862
Data Scintist            35
Name: count, dtype: int64

In [52]:
subset.location.value_counts()

location
Harare               8338
Bulawayo             8078
Mutare               8062
Gweru                7803
Masvingo             7476
                     ... 
 Redcliff               1
Victoria Falls          1
Victoria Falls          1
   Gokwe                1
Gokwe                   1
Name: count, Length: 157, dtype: int64

### imputing missing values in original dataset

In [54]:
missing = subset.columns.tolist()

In [62]:
# creatin encoders
enc1=LabelEncoder()
enc2=LabelEncoder()
# fitting encoders
enc1.fit(df[missing[0]])
enc2.fit(df[missing[1]])
# transforming features to numeric
df[missing[0]]=enc1.transform(df[missing[0]])
df[missing[1]]=enc2.transform(df[missing[1]])

In [64]:
imputer = KNNImputer(n_neighbors=5, weights='uniform')
df[missing] = imputer.fit_transform(df[missing])