In [34]:
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn import ensemble
from yellowbrick.regressor import ResidualsPlot
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.mstats import winsorize
import warnings
warnings.filterwarnings('ignore')

In [35]:
data = pd.read_csv('Train.csv')

In [36]:
#function for dropping columns
def drop_col(df,col_list):
    for col in col_list:
        if col in df.columns:
            df.drop(col, axis = 1, inplace=True)
    return df

def plot_scatter(dataframe, target_col):
    plt.figure()
    for col in dataframe.columns:
        if col != target_col:
            plt.figure(figsize=(12,7))
            plt.scatter(x=dataframe[col], y=dataframe[target_col])
            plt.xlabel(col)
            plt.ylabel(target_col)
            
#outlier analysis
def plot_boxplots(dataframe):
    for col in dataframe.columns:
        plt.figure()
        dataframe.boxplot([col])
        
#checking the number of rows for each variable that fall outside 3*IQR range
def get_outliers_count(dataframe):
    count = {}
    for col in dataframe.columns:
        sorted_col = sorted(dataframe[col])
        Q1,Q3 = np.percentile(sorted_col , [25,75])
        IQR = Q3 - Q1
        lower_range = Q1 - (3 * IQR)
        upper_range = Q3 + (3 * IQR)    
        outliers_df = dataframe[(dataframe[col] > upper_range) | (dataframe[col] < lower_range)]
        count[col] = outliers_df[col].count()
    return count

#removes outliers which are not in 3*IQR range
def remove_outliers(dataframe, column):
    sorted_col = sorted(dataframe[column])
    Q1,Q3 = np.percentile(sorted_col , [25,75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)  
    dataframe_out = dataframe[(dataframe[column] > lower_range) & (dataframe[column] < upper_range)]
    return dataframe_out

def rmsle(actual_column, predicted_column):
    sum=0.0
    for x,y in zip(actual_column,predicted_column):
        if x<0 or y<0: #check for negative values. 
            continue
        p = np.log(y+1)
        r = np.log(x+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted_column))**0.5

def treat_outliers(dataframe, col_list):
    for col in col_list:
        dataframe[col] = winsorize(dataframe[col], limits=[0.05, 0.05],inclusive=(True, True))
    return dataframe

#for other variables using standard scaler
def scale(dataframe, cols_list):
    scaler = StandardScaler()
    dataframe[cols_list] = scaler.fit_transform(dataframe[cols_list])
    return dataframe,scaler

In [37]:
not_req = ['ID','region_code','personal_loan_active','vehicle_loan_active','personal_loan_closed','vehicle_loan_closed','loan_enq']

numerical_cols = ['age', 'cc_cons_apr', 'dc_cons_apr', 'cc_cons_may', 'dc_cons_may',
       'cc_cons_jun', 'dc_cons_jun', 'cc_count_apr', 'cc_count_may',
       'cc_count_jun', 'dc_count_apr', 'dc_count_may', 'dc_count_jun',
       'card_lim', 'investment_1', 'investment_2', 'investment_3',
       'investment_4', 'debit_amount_apr', 'credit_amount_apr',
       'debit_count_apr', 'credit_count_apr', 'max_credit_amount_apr',
       'debit_amount_may', 'credit_amount_may', 'credit_count_may',
       'debit_count_may', 'max_credit_amount_may', 'debit_amount_jun',
       'credit_amount_jun', 'credit_count_jun', 'debit_count_jun',
       'max_credit_amount_jun', 'emi_active']

def custom_pipeline(dataframe, scaler):
    #dropping unnecessary columns
    not_req = ['ID','region_code','personal_loan_active','vehicle_loan_active','personal_loan_closed','vehicle_loan_closed','loan_enq']
    dataframe = dataframe.drop(not_req, axis=1)
    
    #one hot encoding on categorical data
    dataframe = pd.get_dummies(data=dataframe, columns=['account_type', 'gender'])
    
    #list of numerical and categorical columns
    numerical_cols = ['age', 'cc_cons_apr', 'dc_cons_apr', 'cc_cons_may', 'dc_cons_may',
       'cc_cons_jun', 'dc_cons_jun', 'cc_count_apr', 'cc_count_may',
       'cc_count_jun', 'dc_count_apr', 'dc_count_may', 'dc_count_jun',
       'card_lim', 'investment_1', 'investment_2', 'investment_3',
       'investment_4', 'debit_amount_apr', 'credit_amount_apr',
       'debit_count_apr', 'credit_count_apr', 'max_credit_amount_apr',
       'debit_amount_may', 'credit_amount_may', 'credit_count_may',
       'debit_count_may', 'max_credit_amount_may', 'debit_amount_jun',
       'credit_amount_jun', 'credit_count_jun', 'debit_count_jun',
       'max_credit_amount_jun', 'emi_active']
    cat_cols = ['account_type_current','account_type_saving','gender_F','gender_M']
    
    #scaling on test data using same scaler used to fit on train data
    dataframe['investment_4'] = dataframe['investment_4'] - dataframe['investment_4'].min()
    dataframe[numerical_cols] = np.log1p(dataframe[numerical_cols])
    dataframe[numerical_cols] = scaler.transform(dataframe[numerical_cols])
    return dataframe

In [38]:
#removing unwanted columns 
data = drop_col(data, not_req)

In [39]:
#encoding categorical data using one hot encoding
data = pd.get_dummies(data=data, columns=['account_type', 'gender'])
data.head(5)

Unnamed: 0,age,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,dc_cons_jun,cc_count_apr,cc_count_may,cc_count_jun,...,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,emi_active,cc_cons,account_type_current,account_type_saving,gender_F,gender_M
0,35,24893.0,378.0,10288.0,29664.0,16291.4,11432.0,2.0,26.0,10.0,...,12761.0,2.0,65.0,50836.0,1674.09,20014.0,1,0,0,1
1,35,18941.62,966.0,20672.0,287.0,4217.0,885.0,1.0,7.0,13.0,...,76206.0,21.0,63.0,23226.0,13043.34,10173.0,1,0,0,1
2,55,5678.87,2724.0,1964.5,3933.11,23956.25,5168.0,43.0,32.0,102.0,...,75283.0,7.0,1.0,27036.0,25375.27,16095.0,1,0,0,1
3,29,30489.5,1236.0,12609.88,9138.14,17521.0,13650.3,53.0,1.0,50.0,...,68708.0,21.0,83.0,43037.0,3544.33,7707.0,1,0,1,0
4,34,7441.4,6906.04,4364.0,1939.0,2121.0,6829.18,67.0,47.0,82.0,...,80140.0,8.0,32.0,32044.0,12780.44,96408.0,1,0,0,1


In [40]:
data = remove_outliers(data, 'age')

In [41]:
cat_cols = ['account_type_current','account_type_saving','gender_F','gender_M']

In [42]:
X = data.drop(['cc_cons'], axis=1) 
y = np.log1p(data['cc_cons'])

In [43]:
X.shape

(14466, 38)

In [44]:
X.shape

(14466, 38)

In [45]:
X['investment_4'] = X['investment_4']-X['investment_4'].min()

In [46]:
count_dict = get_outliers_count(X)
outliers_count_df = pd.DataFrame(count_dict.items(), columns=['Column', 'Outliers count'])
outliers_count_df

Unnamed: 0,Column,Outliers count
0,age,0
1,cc_cons_apr,462
2,dc_cons_apr,592
3,cc_cons_may,539
4,dc_cons_may,595
5,cc_cons_jun,533
6,dc_cons_jun,638
7,cc_count_apr,501
8,cc_count_may,71
9,cc_count_jun,46


In [47]:
X[numerical_cols] = np.log1p(X[numerical_cols])

In [48]:
scaled_df,scaler_train = scale(X, numerical_cols)

In [49]:
#assessing linear model on this data
X_train, X_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.33, random_state=12)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
print("Training score: "+str(linear_model.score(X_train, y_train)))
print("Test score: "+str(linear_model.score(X_test, y_test)))

Training score: 0.0043837829757321645
Test score: -0.004734932316480256


In [50]:
print(rmsle(y_test, y_pred))

0.15884379936936988


In [51]:
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Training score: "+str(rf_model.score(X_train, y_train)))
print("Test score: "+str(rf_model.score(X_test, y_test)))

Training score: 0.7955635685340648
Test score: -0.10301040888261494


In [52]:
print(rmsle(np.expm1(y_test), np.expm1(y_pred_rf)))

1.7027733045127529


In [53]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}# Create a based model
rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 17.6min


In [26]:
best_grid = grid_search.best_estimator_

In [27]:
best_grid

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=90,
                      max_features=2, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=12,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [28]:
grid_pred = best_grid.predict(X_test)

In [30]:
print(rmsle(y_test, grid_pred))

0.16440084793568022


In [32]:
test_data = pd.read_csv('Test.csv')
test_scaled = custom_pipeline(test_data, scaler_train)
test_scaled.head()

Unnamed: 0,age,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,dc_cons_jun,cc_count_apr,cc_count_may,cc_count_jun,...,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,emi_active,account_type_current,account_type_saving,gender_F,gender_M
0,-0.384427,-1.062991,-0.705334,-0.069603,-1.311025,0.466449,1.703932,-0.795402,-0.463992,-1.598385,...,-0.212189,-0.090714,-0.906082,-0.769643,0.105742,-0.591641,1,0,0,1
1,-0.78852,1.300136,-0.49818,-1.312001,-0.13128,0.643291,-0.909909,-0.997199,-0.463992,-1.598385,...,-0.98273,-0.619953,-0.208296,0.261451,-0.491123,1.313783,1,0,0,1
2,0.892236,-0.298561,0.648286,-0.855073,-1.197684,-2.038255,0.464154,-0.370359,0.492789,-1.427998,...,-0.148233,-0.032286,-0.400507,-1.010336,-0.428723,-0.010233,1,0,0,1
3,-0.78852,-1.347346,1.580838,1.58482,-2.584802,-0.284996,1.953193,-1.624038,1.574319,0.894242,...,0.930405,0.954314,1.219571,-3.122729,0.917804,-1.480062,1,0,0,1
4,1.452281,-0.905869,0.164021,-0.704076,-0.04781,-0.514527,0.665158,-1.25736,-0.463992,0.515319,...,-0.695288,0.143398,-0.208296,0.164793,0.493779,0.040179,1,0,0,1


In [33]:
rf_grid_pred = best_grid.predict(test_scaled)
to_submit = pd.DataFrame({'ID': test_data['ID'], 'cc_cons': np.expm1(rf_grid_pred)} )
to_submit.to_csv('rf_grid_pred_olremoved.csv', index=False)
to_submit.head()

Unnamed: 0,ID,cc_cons
0,17591,15473.865229
1,13541,15105.738643
2,13431,11039.123983
3,8687,17536.488842
4,14727,9742.451236
