In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### EDA & Feature Engineering

In [93]:
data=pd.read_csv("../Data/Car Ownership.csv")

In [94]:
data.head()

Unnamed: 0,Occupation,Monthly Income,Credit Score,Years of Employment,Finance Status,Finance History,Car,Number of Children
0,Teacher,40000,750.0,10,Good,No issues,Yes,2
1,Engineer,60000,800.0,8,Good,No issues,Yes,1
2,Nurse,25000,650.0,5,Fair,Late payment 2 months ago,No,0
3,Doctor,80000,900.0,12,Excellent,No issues,Yes,3
4,Businessman,100000,700.0,15,Good,Late payment 6 months ago,Yes,2


In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Occupation           490 non-null    object 
 1   Monthly Income       487 non-null    object 
 2   Credit Score         460 non-null    float64
 3   Years of Employment  457 non-null    object 
 4   Finance Status       477 non-null    object 
 5   Finance History      472 non-null    object 
 6   Car                  480 non-null    object 
 7   Number of Children   375 non-null    object 
dtypes: float64(1), object(7)
memory usage: 31.4+ KB


In [96]:
data.describe()

Unnamed: 0,Credit Score
count,460.0
mean,702.969565
std,69.232323
min,500.0
25%,650.0
50%,700.0
75%,750.0
max,900.0


In [97]:
data['Car'].value_counts()

Yes    301
No     179
Name: Car, dtype: int64

In [98]:
for col in data.columns:
    print(f"--------------{col}--------------")
    print(data[col].value_counts())
    print("\n")

--------------Occupation--------------
Chef                         34
Electrician                  19
Graphic Designer             17
Nurse                        16
Architect                    13
                             ..
Retail Salesperson            1
Bartender                     1
Librarian                     1
Elementary School Teacher     1
Customer Support              1
Name: Occupation, Length: 116, dtype: int64


--------------Monthly Income--------------
$4,500       33
$3,500       27
$3,200       20
$4,200       18
$3,000       18
             ..
10000 USD     1
3500 USD      1
2.5k          1
6000 usd      1
$7            1
Name: Monthly Income, Length: 98, dtype: int64


--------------Credit Score--------------
670.0    32
680.0    31
720.0    29
650.0    27
690.0    22
         ..
830.0     1
770.0     1
890.0     1
840.0     1
641.0     1
Name: Credit Score, Length: 64, dtype: int64


--------------Years of Employment--------------
4 years     74
3 years     

#### converting columns into their correct format

In [99]:
def removeUnnecessaryString(element):
    final=""
    for c in str(element):
        if c.isnumeric():
            final+=c
    if final!="":
        return float(final)
    else:
        return np.nan

In [100]:
data

Unnamed: 0,Occupation,Monthly Income,Credit Score,Years of Employment,Finance Status,Finance History,Car,Number of Children
0,Teacher,40000,750.0,10,Good,No issues,Yes,2
1,Engineer,60000,800.0,8,Good,No issues,Yes,1
2,Nurse,25000,650.0,5,Fair,Late payment 2 months ago,No,0
3,Doctor,80000,900.0,12,Excellent,No issues,Yes,3
4,Businessman,100000,700.0,15,Good,Late payment 6 months ago,Yes,2
...,...,...,...,...,...,...,...,...
495,Project Manager,"$7,000",730.0,5 years,Stable,No significant issues,Yes,0
496,Chef,"$4,500",680.0,4 years,Stable,No significant issues,Yes,0
497,Interior Designer,"$5,500",690.0,4 years,Stable,No significant issues,Yes,1
498,Medical Assistant,"$3,500",640.0,3 years,Stable,No significant issues,No,1


In [101]:
data['Monthly Income']=data['Monthly Income'].apply(removeUnnecessaryString)
data['Years of Employment']=data['Years of Employment'].apply(removeUnnecessaryString)
data['Number of Children']=data['Number of Children'].apply(removeUnnecessaryString)

#### Handle null values

In [102]:
data.isna().sum()

Occupation              10
Monthly Income          14
Credit Score            40
Years of Employment     43
Finance Status          23
Finance History         28
Car                     20
Number of Children     154
dtype: int64

In [103]:
for col in data.columns:
    if data[col].isnull().any():
        print(f"{col}-------{(data[col].isnull().sum()/data.shape[0])*100:.2f}")

Occupation-------2.00
Monthly Income-------2.80
Credit Score-------8.00
Years of Employment-------8.60
Finance Status-------4.60
Finance History-------5.60
Car-------4.00
Number of Children-------30.80


##### handle it using statistic properties 

In [104]:
categorical_columns=[]
numerical_columns=[]

for col in data.columns:
    if data[col].dtype=="object":
        categorical_columns.append(col)
    else:
        numerical_columns.append(col)

In [105]:
numerical_columns

['Monthly Income', 'Credit Score', 'Years of Employment', 'Number of Children']

In [106]:
X=data.drop('Car',axis=1)
Y=data['Car']

In [107]:
from sklearn.model_selection import train_test_split

In [108]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [109]:
qualitative_null_value_columns=[]
quantitative_null_value_columns=[]
for column in X_test.columns:
    if data[column].isnull().any() and data[column].dtype != "object":
        quantitative_null_value_columns.append(column)
    else:
        qualitative_null_value_columns.append(column)

In [110]:
qualitative_null_value_columns

['Occupation', 'Finance Status', 'Finance History']

In [111]:
from sklearn.impute import SimpleImputer

In [112]:
numeric_imputer=SimpleImputer(strategy='mean')

In [113]:
X_train[quantitative_null_value_columns]=numeric_imputer.fit_transform(X_train[quantitative_null_value_columns])

In [114]:
X_test[quantitative_null_value_columns]=numeric_imputer.transform(X_test[quantitative_null_value_columns])

In [115]:
categorical_imputer=SimpleImputer(strategy="most_frequent")

In [116]:
X_train[qualitative_null_value_columns]=categorical_imputer.fit_transform(X_train[qualitative_null_value_columns])
X_test[qualitative_null_value_columns]=categorical_imputer.transform(X_test[qualitative_null_value_columns])

In [117]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 107 to 172
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Occupation           400 non-null    object 
 1   Monthly Income       400 non-null    float64
 2   Credit Score         400 non-null    float64
 3   Years of Employment  400 non-null    float64
 4   Finance Status       400 non-null    object 
 5   Finance History      400 non-null    object 
 6   Number of Children   400 non-null    float64
dtypes: float64(4), object(3)
memory usage: 25.0+ KB


In [118]:
X_train['Number of Children']=X_train['Number of Children'].astype(int)
X_test['Number of Children']=X_test['Number of Children'].astype(int)

In [146]:
Y_train=Y_train.to_frame()
Y_test=Y_test.to_frame()

#### Encode the categorical variable

In [119]:
categorical_columns[0:3]

['Occupation', 'Finance Status', 'Finance History']

In [120]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [121]:
X_train

Unnamed: 0,Occupation,Monthly Income,Credit Score,Years of Employment,Finance Status,Finance History,Number of Children
107,Graphic Designer,3200.0,670.000000,3.0,Stable,No significant issues,0
336,Social Worker,3600.0,703.716621,2.0,Unstable,Missed payments in the past,3
71,Chef,2800.0,630.000000,1.0,Unstable,Missed payments in the past,1
474,Public Relations,5800.0,690.000000,4.0,Stable,No significant issues,0
6,Teacher,35000.0,700.000000,8.0,Fair,Late payment 3 months ago,1
...,...,...,...,...,...,...,...
323,Insurance Agent,4200.0,680.000000,3.0,Fair,No significant issues,1
192,Marketing Analyst,5000.0,730.000000,4.0,Stable,No significant issues,0
117,Optometrist,9000.0,820.000000,8.0,Stable,No significant issue,3
47,Web Developer,6000.0,730.000000,5.0,Stable,No significant issues,1


In [130]:
categorical_encoder=OneHotEncoder(handle_unknown="ignore")

In [131]:
column_transformer = ColumnTransformer(transformers=[('encoder', categorical_encoder , categorical_columns[0:3])], remainder='passthrough')

In [132]:
column_transformer.fit(X_train)

In [133]:
X_train_encoded=column_transformer.transform(X_train).toarray()

In [134]:
X_train_encoded.shape

(400, 137)

In [135]:
column_transformer.get_feature_names()



['encoder__x0_Account Executive',
 'encoder__x0_Account Manager',
 'encoder__x0_Accountant',
 'encoder__x0_Architect',
 'encoder__x0_Art Director',
 'encoder__x0_Attorney',
 'encoder__x0_Automotive Mechanic',
 'encoder__x0_Bank Teller',
 'encoder__x0_Barista',
 'encoder__x0_Bartender',
 'encoder__x0_Business Analyst',
 'encoder__x0_Business Owner',
 'encoder__x0_Businessman',
 'encoder__x0_Carpenter',
 'encoder__x0_Chef',
 'encoder__x0_Civil Engineer',
 'encoder__x0_Computer Programmer',
 'encoder__x0_Construction Worker',
 'encoder__x0_Copywriter',
 'encoder__x0_Customer Service',
 'encoder__x0_Customer Service Rep',
 'encoder__x0_Customer Support',
 'encoder__x0_Data Analyst',
 'encoder__x0_Data Scientist',
 'encoder__x0_Dental Assistant',
 'encoder__x0_Dental Hygienist',
 'encoder__x0_Dentist',
 'encoder__x0_Designer',
 'encoder__x0_Doctor',
 'encoder__x0_Electrical Engineer',
 'encoder__x0_Electrician',
 'encoder__x0_Engineer',
 'encoder__x0_Entrepreneur',
 'encoder__x0_Event Plann

In [136]:
X_test_encoded=column_transformer.transform(X_test).toarray()

In [145]:
def ownLabelEncodder(x):
    if x=="Yes":
        return 1
    else:
        return 0

In [144]:
Y_train.apply(ownLabelEncodder)

107    0
336    0
71     0
474    1
6      1
      ..
323    1
192    1
117    1
47     1
172    0
Name: Car, Length: 400, dtype: int64