### Import essential library and read csv

In [1]:
import pandas as pd

dataset = pd.read_csv("test.csv",header=0,encoding='utf-8')
df = dataset.copy()

In [2]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,66415476,71141331,20000,20000,20000.0,36 months,16.99,712.96,D,D3,...,1554.0,51.8,1.0,1.0,3527.0,78.5,8100.0,1.0,0.0,1.0
1,65199509,69726237,3700,3700,3700.0,36 months,12.59,123.94,C,C2,...,,,,,,,18700.0,,,
2,123688,123685,1800,1800,775.0,36 months,17.22,64.38,G,G3,...,,,,,,,,,,
3,54067210,57607924,24000,24000,24000.0,60 months,7.89,485.38,A,A5,...,,,,,,,84200.0,,,
4,60565847,64586591,10000,10000,9900.0,36 months,6.89,308.27,A,A3,...,,,,,,,15900.0,,,


## Get dummy variables


The get_dummies() function is used to convert categorical variable into dummy/indicator variables

In [3]:
df = pd.get_dummies(df,columns=['grade','sub_grade','home_ownership',
                                'verification_status','loan_status','pymnt_plan'
                                ,'initial_list_status','application_type'],drop_first=True)

In [4]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_title,emp_length,...,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,loan_status_Current,loan_status_Does not meet the credit policy. Status:Charged Off,loan_status_Does not meet the credit policy. Status:Fully Paid,loan_status_Fully Paid,initial_list_status_w,application_type_JOINT
0,66415476,71141331,20000,20000,20000.0,36 months,16.99,712.96,,,...,0,0,0,0,1,0,0,0,1,1
1,65199509,69726237,3700,3700,3700.0,36 months,12.59,123.94,,,...,0,0,0,0,1,0,0,0,0,1
2,123688,123685,1800,1800,775.0,36 months,17.22,64.38,"disabled, student",1 year,...,0,1,0,0,0,0,1,0,0,0
3,54067210,57607924,24000,24000,24000.0,60 months,7.89,485.38,reg nurse,10+ years,...,0,0,1,0,1,0,0,0,0,0
4,60565847,64586591,10000,10000,9900.0,36 months,6.89,308.27,Interim Director of Case Management,3 years,...,0,0,1,0,1,0,0,0,0,0


In [5]:
selected_features = ['grade_C','grade_D','grade_E',
                     'grade_F','grade_G','total_rec_int',
                     'total_pymnt_inv','funded_amnt_inv','sub_grade_B5',
                     'sub_grade_B4','sub_grade_C5','sub_grade_C4',
                     'sub_grade_C3','sub_grade_D5','int_rate']

features = df[selected_features]

In [6]:
# Converting Dataframes to Individual Arrays of Features and Labels to Fit to a Model
import numpy as np

# Labels are the values we want to predict
labels = np.array(features['int_rate'])

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('int_rate', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

### A random forest fits a number of classifying decision trees on various sub-samples of the dataset to improve ### the predictive accuracy and control over-fitting.

In [7]:
from sklearn.ensemble import RandomForestRegressor
rf_best = RandomForestRegressor(n_estimators= 20,
 min_samples_split=2,
 min_samples_leaf=2,
 max_features='auto',
 max_depth=60,
 bootstrap=True)

In [8]:
rf_best.fit(features,labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=60, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
predictions = rf_best.predict(features)

In [10]:
predictions

array([15.45205   , 13.18866429, 13.07884762,  8.82462738,  8.70841845,
       15.62070833, 15.44728333, 15.04660833, 13.14491667, 19.55709464,
       12.01020595, 17.50976667, 11.80202083,  9.29915417, 11.44777083,
       11.46739762, 10.80984762, 10.11403333, 16.3762    , 12.82758333,
        8.68580595, 18.33322024, 11.342725  , 20.59035   , 13.38099167,
       19.7840375 , 19.98272024, 12.83473333, 12.4562125 , 10.11714524,
       20.47479167, 20.13001667,  9.636425  , 19.17808452, 18.49259762,
       18.48050238, 20.276575  , 19.99867381, 19.055625  ,  8.80055   ,
       13.63444167, 13.4324    , 20.19495714, 17.78128452, 17.2331619 ,
       19.75309464, 21.285925  , 10.33614524, 10.31639167, 19.02224702,
       14.46711667, 13.917175  , 13.78795833, 18.41431905, 16.94885595,
       14.95031667, 18.45272738, 18.28265238, 10.65741905, 17.93094167,
       16.44443452, 20.8069    ,  9.21826071, 14.86779762, 20.38607857,
       20.03056607, 12.20909167, 16.34633929, 14.01038333, 15.59

In [11]:
pred = pd.DataFrame(data = predictions,columns = ['predicted_int_rate'])

In [13]:
pred.to_csv("predictions.csv")