In [1]:
import os
import pandas as pd
import warnings
import seaborn as sns
sns.set_style(style="darkgrid")
warnings.filterwarnings("ignore")

### Getting path to dataset

In [2]:
current=os.getcwd()
root=os.path.dirname(current)

In [3]:
print(root)

C:\Users\t470p\Documents\GitHub\channels\New folder\Claxon-DataScience-hackathon


In [4]:
os.listdir(root)

['.git', '.gitattributes', 'data', 'LICENSE', 'notebooks', 'source documents']

In [5]:
path=os.path.join(root,'data','data_science_competition_2024.csv')

### reading data

In [6]:
df=pd.read_csv(path)

### checking entries in data

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,loan_id,gender,disbursemet_date,currency,country,sex,is_employed,job,location,...,number_of_defaults,outstanding_balance,interest_rate,age,number_of_defaults.1,remaining term,salary,marital_status,age.1,Loan Status
0,0,8d05de78-ff32-46b1-aeb5-b3190f9c158a,female,2022 10 29,USD,Zimbabwe,female,True,Teacher,Beitbridge,...,0,48653.011473,0.22,37,0,47,3230.038869,married,37,Did not default
1,1,368bf756-fcf2-4822-9612-f445d90b485b,other,2020 06 06,USD,Zimbabwe,other,True,Teacher,Harare,...,2,28752.062237,0.2,43,2,62,3194.139103,single,43,Did not default
2,2,6e3be39e-49b5-45b5-aab6-c6556de53c6f,other,2023 09 29,USD,Zimbabwe,other,True,Nurse,Gweru,...,1,44797.554126,0.22,43,1,57,3330.826656,married,43,Did not default
3,3,191c62f8-2211-49fe-ba91-43556b307871,female,2022 06 22,USD,Zimbabwe,female,True,Doctor,Rusape,...,0,35681.496413,0.23,47,0,42,2246.79702,divorced,47,Did not default
4,4,477cd8a1-3b01-4623-9318-8cd6122a8346,male,2023 02 08,USD,Zimbabwe,male,True,Nurse,Chipinge,...,0,34156.055882,0.2,42,0,45,2310.858441,married,42,Did not default


## __Data Cleaning__

### `Unnamed` feature is unnecessary as it only indicates the index of a row in the dataset

### getting rid of the column

In [8]:
df=pd.read_csv(path,index_col=0)

In [9]:
df.head()

Unnamed: 0,loan_id,gender,disbursemet_date,currency,country,sex,is_employed,job,location,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,number_of_defaults.1,remaining term,salary,marital_status,age.1,Loan Status
0,8d05de78-ff32-46b1-aeb5-b3190f9c158a,female,2022 10 29,USD,Zimbabwe,female,True,Teacher,Beitbridge,39000.0,0,48653.011473,0.22,37,0,47,3230.038869,married,37,Did not default
1,368bf756-fcf2-4822-9612-f445d90b485b,other,2020 06 06,USD,Zimbabwe,other,True,Teacher,Harare,27000.0,2,28752.062237,0.2,43,2,62,3194.139103,single,43,Did not default
2,6e3be39e-49b5-45b5-aab6-c6556de53c6f,other,2023 09 29,USD,Zimbabwe,other,True,Nurse,Gweru,35000.0,1,44797.554126,0.22,43,1,57,3330.826656,married,43,Did not default
3,191c62f8-2211-49fe-ba91-43556b307871,female,2022 06 22,USD,Zimbabwe,female,True,Doctor,Rusape,24000.0,0,35681.496413,0.23,47,0,42,2246.79702,divorced,47,Did not default
4,477cd8a1-3b01-4623-9318-8cd6122a8346,male,2023 02 08,USD,Zimbabwe,male,True,Nurse,Chipinge,19000.0,0,34156.055882,0.2,42,0,45,2310.858441,married,42,Did not default


### checking for number of rows and columns

In [10]:
rows,columns=df.shape

In [11]:
print("rows:{}\ncolumns:{}".format(rows,columns))

rows:100000
columns:20


### checking for missing values

In [12]:
df.isnull().sum()

loan_id                    0
gender                     0
disbursemet_date           0
currency                   0
country                  100
sex                        0
is_employed                0
job                     4136
location                 595
loan_amount                0
number_of_defaults         0
outstanding_balance        0
interest_rate              0
age                        0
number_of_defaults.1       0
remaining term             0
salary                     0
marital_status             0
age.1                      0
Loan Status                0
dtype: int64

### further inspecting null columns

In [13]:
nulls=df.isnull().sum()

In [14]:
nulls=nulls[nulls>0]

In [15]:
nulls

country      100
job         4136
location     595
dtype: int64

### getting feature_names for null columns

In [16]:
null_columns=nulls.index.tolist()
null_columns

['country', 'job', 'location']

In [17]:
subset=df[null_columns]
subset.head()

Unnamed: 0,country,job,location
0,Zimbabwe,Teacher,Beitbridge
1,Zimbabwe,Teacher,Harare
2,Zimbabwe,Nurse,Gweru
3,Zimbabwe,Doctor,Rusape
4,Zimbabwe,Nurse,Chipinge


### cleaning country first

In [18]:
subset.country.value_counts()

Zimbabwe    99787
zimbabwe      100
Zim            13
Name: country, dtype: int64

### it seems that the only value under country is Zimbabwe so will fill every missing value with `Zimbabwe`

### But first:

### __making the country codes uniform__

__converting `zimbabwe` to `Zimbabwe`__

In [19]:
subset['country']=subset.country.str.title()

__viewing changes__

In [20]:
subset.country.value_counts()

Zimbabwe    99887
Zim            13
Name: country, dtype: int64

__now converting `Zim` to `Zimbabwe`__

In [21]:
subset['country']=subset.country.apply(lambda x:'Zimbabwe' if x=='Zim' else x)

__viewing changes__

In [22]:
subset.country.value_counts()

Zimbabwe    99900
Name: country, dtype: int64

In [23]:
rows

100000

__now that all values for country are uniform,__

__filling missing values with `Zimbabwe`__

In [24]:
subset['country']=subset.country.fillna('Zimbabwe')

In [25]:
df['country']=subset.country

__getting rid of imputed feature in subset__

In [26]:
subset=subset.drop('country',axis=1)

In [27]:
subset.head()

Unnamed: 0,job,location
0,Teacher,Beitbridge
1,Teacher,Harare
2,Nurse,Gweru
3,Doctor,Rusape
4,Nurse,Chipinge


__inspecting the 2 remaining null features__

In [28]:
subset.describe().transpose()

Unnamed: 0,count,unique,top,freq
job,95864,11,Engineer,16524
location,99405,157,Harare,8338


In [29]:
subset.job.value_counts()

Engineer              16524
Nurse                 15284
Data Analyst          13204
Doctor                12186
Software Developer    11932
Teacher                8950
Accountant             7802
SoftwareDeveloper      3564
Data Scientist         3521
Lawyer                 2862
Data Scintist            35
Name: job, dtype: int64

In [30]:
subset.location.value_counts()

Harare                8338
Bulawayo              8078
Mutare                8062
Gweru                 7803
Masvingo              7476
                      ... 
Gokwe                    1
Victoria Falls           1
   Gokwe                 1
 Victoria Falls          1
   Zvishavane            1
Name: location, Length: 157, dtype: int64

### imputing missing values in original dataset

In [31]:
import numpy as np
from sklearn.impute import KNNImputer

# Assuming your dataset is stored in a pandas DataFrame 'df'
# and the missing values are denoted by 'nan'

# Identify the features with missing values
features_with_missing = ['job', 'location']

# Impute the missing values using kNN
imputer = KNNImputer(n_neighbors=5, weights='uniform')
df[features_with_missing] = imputer.fit_transform(df[features_with_missing])

ImportError: cannot import name 'KNNImputer' from 'sklearn.impute' (C:\Users\t470p\Anaconda3\lib\site-packages\sklearn\impute\__init__.py)