In [193]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
import matplotlib.pyplot as plt
from scipy import stats

In [194]:
# Load test.csv (path is relative to the current working directory) into df_test.
df_test = pd.read_csv('test.csv')

In [195]:
# Preview the first 20 rows of the loaded frame.
df_test.head(20)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No
5,MZZAQMPT,Male,60,RG268,Self_Employed,X3,110,No,4624262,No
6,Y88TW36I,Female,69,RG253,Other,X2,67,No,1032764,No
7,3UGOAQNU,Female,30,RG257,Salaried,X1,33,No,837009,No
8,VVUKMUJT,Male,43,RG284,Salaried,X3,81,,1001232,Yes
9,9R363ZXS,Female,54,RG283,Self_Employed,X2,37,Yes,1667851,No


In [196]:
# Count missing values per column.
df_test.isnull().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64

In [197]:
# (rows, columns) of the loaded frame.
df_test.shape

(105312, 10)

In [198]:
# Work on a frame without the identifier column; df_test itself keeps ID
# so it can be re-attached to the processed rows later.
df = df_test.drop(columns=['ID'])

In [199]:
# List the columns remaining after ID was dropped.
df.columns

Index(['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code', 'Vintage',
       'Credit_Product', 'Avg_Account_Balance', 'Is_Active'],
      dtype='object')

### Scaling 

In [200]:
# Pull out the numeric columns that are going to be standardised.
df_num = df[['Age', 'Avg_Account_Balance', 'Vintage']]

In [201]:
# Standardise the numeric columns with StandardScaler.
# NOTE(review): the scaler is fit on this file's own rows (df_num comes from
# test.csv) — confirm this matches how the training data was scaled.
ct = ColumnTransformer(
    transformers=[
        ('std-sclr', StandardScaler(), ['Age', 'Avg_Account_Balance', 'Vintage']),
    ],
    remainder='passthrough',
)

# fit_transform returns a plain array, so re-wrap it with the original
# column names and row index to allow an index-aligned join afterwards.
num_scaled = ct.fit_transform(df_num)
df_num_scaled = pd.DataFrame(
    num_scaled,
    index=df_num.index,
    columns=df_num.columns,
)
df_num_scaled.head()

Unnamed: 0,Age,Avg_Account_Balance,Vintage
0,-0.999878,-0.452333,-0.676804
1,-0.058563,-0.240878,0.066841
2,-0.865404,-1.060037,-1.017641
3,-0.999878,-0.307219,-0.428922
4,-0.999878,-0.550781,-0.862715


In [202]:
# Replace the raw numeric columns with their scaled versions
# (the join aligns on the row index).
df = (
    df
    .drop(columns=['Age', 'Avg_Account_Balance', 'Vintage'])
    .join(df_num_scaled)
)

In [203]:
# Label every remaining null with the sentinel string 'Missing'.
# Per the null counts shown earlier, only Credit_Product contains nulls.
df = df.fillna('Missing')

### Encoding

In [204]:
# One-hot encode the categorical features. Credit_Product is not listed
# here, so it stays as a single string column.
df = pd.get_dummies(
    df,
    columns=[
        'Gender',
        'Region_Code',
        'Occupation',
        'Channel_Code',
        'Is_Active',
    ],
)

In [205]:
# Drop one dummy column per encoded feature (its reference level) to avoid
# the dummy variable trap. The dropped levels are hard-coded by name rather
# than chosen by position, so every listed level must exist in the data.
df = df.drop(
    columns=[
        'Gender_Female',
        'Is_Active_No',
        'Channel_Code_X4',
        'Occupation_Entrepreneur',
        'Region_Code_RG284',
    ]
)

In [206]:
# Inspect the column set after encoding and dropping the reference levels.
df.columns

Index(['Credit_Product', 'Age', 'Avg_Account_Balance', 'Vintage',
       'Gender_Male', 'Region_Code_RG250', 'Region_Code_RG251',
       'Region_Code_RG252', 'Region_Code_RG253', 'Region_Code_RG254',
       'Region_Code_RG255', 'Region_Code_RG256', 'Region_Code_RG257',
       'Region_Code_RG258', 'Region_Code_RG259', 'Region_Code_RG260',
       'Region_Code_RG261', 'Region_Code_RG262', 'Region_Code_RG263',
       'Region_Code_RG264', 'Region_Code_RG265', 'Region_Code_RG266',
       'Region_Code_RG267', 'Region_Code_RG268', 'Region_Code_RG269',
       'Region_Code_RG270', 'Region_Code_RG271', 'Region_Code_RG272',
       'Region_Code_RG273', 'Region_Code_RG274', 'Region_Code_RG275',
       'Region_Code_RG276', 'Region_Code_RG277', 'Region_Code_RG278',
       'Region_Code_RG279', 'Region_Code_RG280', 'Region_Code_RG281',
       'Region_Code_RG282', 'Region_Code_RG283', 'Occupation_Other',
       'Occupation_Salaried', 'Occupation_Self_Employed', 'Channel_Code_X1',
       'Channel_Code_X2'

In [207]:
# (rows, columns) after encoding.
df.shape

(105312, 46)

In [208]:
# With a single dummy left for each binary feature, restore the plain
# feature names.
df = df.rename(
    columns={
        'Gender_Male': 'Gender',
        'Is_Active_Yes': 'Is_Active',
    }
)

In [209]:
# Class balance of Credit_Product, including the 'Missing' sentinel.
df['Credit_Product'].value_counts()

No         61608
Yes        31182
Missing    12522
Name: Credit_Product, dtype: int64

In [210]:
# Rows whose Credit_Product is the 'Missing' sentinel are the ones to
# predict; all other rows serve as training data for the imputer.
is_missing = df['Credit_Product'] == 'Missing'
df_missing_test = df[is_missing]
df_missing_train = df[~is_missing]

In [211]:
# Separate features from the Credit_Product label. The rows to be imputed
# have no usable label, so only their features are kept.
df_missing_train_y = df_missing_train['Credit_Product']
df_missing_train_x = df_missing_train.drop(columns=['Credit_Product'])
df_missing_test_x = df_missing_test.drop(columns=['Credit_Product'])

In [212]:
# Sanity-check the shapes of the imputation train/test pieces.
print(f"X_train_shape:{df_missing_train_x.shape}")
print(f"Y_train_shape:{df_missing_train_y.shape}")
print(f"X_test_shape:{df_missing_test_x.shape}")

X_train_shape:(92790, 45)
Y_train_shape:(92790,)
X_test_shape:(12522, 45)


### One-hot encoding the imputation target (`df_missing_train_y`)

In [213]:
# One-hot encode the label Series into one indicator column per class.
# (get_dummies only consults `columns` for DataFrame input, so it is not
# passed for this Series.)
df_missing_train_y = pd.get_dummies(df_missing_train_y)

In [214]:
# Check which indicator columns the encoding produced.
df_missing_train_y.columns

Index(['No', 'Yes'], dtype='object')

In [215]:
# Preview the encoded label columns.
df_missing_train_y.head()

Unnamed: 0,No,Yes
0,0,1
2,1,0
3,1,0
4,1,0
5,1,0


In [216]:
# Keep only the 'Yes' indicator and call it Credit_Product, giving a
# single 0/1 label column (1 == 'Yes').
df_missing_train_y = (
    df_missing_train_y
    .drop(columns=['No'])
    .rename(columns={'Yes': 'Credit_Product'})
    .loc[:, ['Credit_Product']]
)
df_missing_train_y.head(2)

Unnamed: 0,Credit_Product
0,1
2,0


In [217]:
# Check the dtype of the binary label column.
df_missing_train_y.dtypes

Credit_Product    uint8
dtype: object

In [218]:
# Coerce the label column to a numeric dtype.
# NOTE(review): the dtypes output above already reports uint8, so this
# looks like a no-op safeguard — confirm before removing.
df_missing_train_y['Credit_Product'] = pd.to_numeric(df_missing_train_y['Credit_Product'])

### Predicting missing values

In [219]:
# Impute Credit_Product: train a random forest on the rows where the value
# is known, then predict it for the rows where it was missing.
# random_state pins the forest's randomness so that Restart & Run All
# reproduces the same imputed values.
rf_classifier = RandomForestClassifier(random_state=42)
# Pass the label as a 1-D Series: a single-column DataFrame makes sklearn
# emit a DataConversionWarning ("column-vector y was passed when a 1d array
# was expected").
rf_classifier = rf_classifier.fit(
    df_missing_train_x,
    df_missing_train_y['Credit_Product'],
)
pred_values = rf_classifier.predict(df_missing_test_x)

  


In [220]:
# Wrap the predictions in a frame that shares the row index of the rows
# they were predicted for, so they can be joined back by index.
y = pd.DataFrame(
    {'Credit_Product': pred_values},
    index=df_missing_test_x.index,
)

In [221]:
# Distribution of the predicted Credit_Product values.
y.value_counts()

Credit_Product
0                 6472
1                 6050
dtype: int64

In [222]:
# Check the dtype of the predicted label column.
y.dtypes

Credit_Product    uint8
dtype: object

In [223]:
# Preview the predictions; the index labels are those of the imputed rows.
y.head()


Unnamed: 0,Credit_Product
1,0
8,1
12,0
19,0
22,1


In [224]:
# Re-attach a Credit_Product column to both feature sets (index-aligned):
# predicted values for the imputed rows, known values for the rest.
df_missing_predicted_test = df_missing_test_x.join(y, how='left')
df_missing_predicted_train = df_missing_train_x.join(df_missing_train_y, how='left')

In [225]:
# Stack the imputed rows and the originally-complete rows back into one
# frame. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the same row order
# (imputed rows first) and the original index labels.
df_final = pd.concat([df_missing_predicted_test, df_missing_predicted_train])

In [226]:
# Preview the recombined frame (imputed rows come first).
df_final.head()

Unnamed: 0,Age,Avg_Account_Balance,Vintage,Gender,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,...,Region_Code_RG282,Region_Code_RG283,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Is_Active,Credit_Product
1,-0.058563,-0.240878,0.066841,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
8,-0.058563,-0.153494,1.058367,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
12,-0.1258,1.23257,0.686545,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
19,2.093013,-0.615561,1.430189,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
22,1.958539,-0.336469,1.182308,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [227]:
# Row count should match the loaded file: no rows lost in the split/recombine.
df_final.shape

(105312, 46)

In [228]:
# Same preview as the earlier df_final.head() cell.
df_final.head()

Unnamed: 0,Age,Avg_Account_Balance,Vintage,Gender,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,...,Region_Code_RG282,Region_Code_RG283,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Is_Active,Credit_Product
1,-0.058563,-0.240878,0.066841,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
8,-0.058563,-0.153494,1.058367,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
12,-0.1258,1.23257,0.686545,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
19,2.093013,-0.615561,1.430189,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
22,1.958539,-0.336469,1.182308,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [229]:
# Single-column frame holding the identifiers, indexed like df_test.
df_id = df_test[['ID']]

In [230]:
# Re-attach ID by row index; df_final still carries df_test's index labels.
df_final = df_final.join(df_id, how='left')


In [231]:
# One more column than before: ID has been added.
df_final.shape

(105312, 47)

In [232]:
# Coerce Credit_Product to a numeric dtype in the final frame.
# NOTE(review): both stacked pieces were reported as uint8 earlier, so this
# is presumably a no-op safeguard — confirm before removing.
df_final['Credit_Product'] = pd.to_numeric(df_final['Credit_Product'])

In [233]:
# Final dtype check before writing the file.
df_final.dtypes

Age                         float64
Avg_Account_Balance         float64
Vintage                     float64
Gender                        uint8
Region_Code_RG250             uint8
Region_Code_RG251             uint8
Region_Code_RG252             uint8
Region_Code_RG253             uint8
Region_Code_RG254             uint8
Region_Code_RG255             uint8
Region_Code_RG256             uint8
Region_Code_RG257             uint8
Region_Code_RG258             uint8
Region_Code_RG259             uint8
Region_Code_RG260             uint8
Region_Code_RG261             uint8
Region_Code_RG262             uint8
Region_Code_RG263             uint8
Region_Code_RG264             uint8
Region_Code_RG265             uint8
Region_Code_RG266             uint8
Region_Code_RG267             uint8
Region_Code_RG268             uint8
Region_Code_RG269             uint8
Region_Code_RG270             uint8
Region_Code_RG271             uint8
Region_Code_RG272             uint8
Region_Code_RG273           

In [234]:
# Write the processed frame to test_formatted.csv in the working directory.
# No `index` argument is given, so pandas also writes the row index as the
# first (unnamed) column.
df_final.to_csv('test_formatted.csv')