<a href="https://colab.research.google.com/github/TaylorW-12/SVM/blob/main/Copy_of_Project_2_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Data Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#Data Exploration
import plotly.express as px

#Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.svm import SVR

#Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
#Import Data
# Read the loan dataset from a CSV file located in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="LC_HW2.csv")

# Last expression of the cell: renders a preview of the first five rows.
data.head(n=5)



Unnamed: 0,all_util,annual_inc,application_type,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,emp_length,fico_range_high,fico_range_low,...,mo_sin_old_rev_tl_op,mort_acc,mths_since_last_record,mths_since_recent_inq,open_acc,pub_rec_bankruptcies,term,total_bal_ex_mort,verification_status,int_rate
0,80.0,36000.0,Individual,0,0,0,26.33,,674,670,...,35,0,,0.0,5,0,36 months,34683,Verified,18.62
1,61.0,45000.0,Individual,0,0,0,38.51,1 year,734,730,...,103,0,,0.0,14,0,60 months,63873,Not Verified,16.08
2,31.0,53040.0,Individual,0,0,0,25.2,< 1 year,809,805,...,145,4,,0.0,9,0,36 months,24452,Verified,7.56
3,87.0,125000.0,Individual,0,0,0,27.87,10+ years,684,680,...,230,4,,19.0,14,0,36 months,141033,Verified,17.3
4,75.0,73000.0,Individual,0,0,0,35.12,10+ years,759,755,...,126,1,,13.0,19,0,36 months,160302,Not Verified,10.81


In [None]:
# Data Preprocessing
# Cleans the raw loan frame in place of `data`: drops leakage-prone columns,
# removes outliers, imputes missing values, and one-hot encodes the
# categorical features.

#Removing columns due to data leakage
data=data.drop(columns=['loan_amnt','term','application_type'])
#Potentially, these variables could lead to data leakage if they reflect after loan application:
#loan status, mths_since_last_record, mths_since_rcnt_il, mths_since_recent_bc, mths_since_recent_inq

#Check if normally distributed
#fig5=px.histogram(data,x="dti")
#fig6=px.histogram(data,x="emp_length")
#fig7=px.histogram(data,x="inq_last_12m")
#fig8=px.histogram(data,x="mths_since_last_record")
#fig9=px.histogram(data,x="mths_since_recent_inq")
#fig5.show()
#fig6.show()
#fig7.show()
#fig8.show()
#fig9.show()

#Remove outliers
# Comparisons against NaN evaluate to False, so rows with a missing
# fico_range_low or all_util are removed by this filter as well.
data=data[(data['fico_range_low'] < 800) & (data['all_util'] < 100)]
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
data=data[data['annual_inc']<=data['annual_inc'].quantile(.95)].copy()

#Handling missing values
# Numeric columns are imputed with their column mean.
for col in ['dti','mo_sin_old_il_acct','inq_last_12m','mths_since_last_record','mths_since_recent_inq']:
    data[col]=data[col].fillna(data[col].mean())
# Categorical column is imputed with its most frequent value. Series.mode()
# returns a Series; passing that Series to fillna aligns on index labels and
# would only fill the row labelled 0, so take the scalar with .iloc[0].
data['emp_length']=data['emp_length'].fillna(data['emp_length'].mode().iloc[0])


# One-hot encode the categorical features.
cat_cols=['verification_status','emp_length','home_ownership']
ohe=OneHotEncoder()
ohe_features=ohe.fit_transform(data[cat_cols]).toarray()
ohe_labels=ohe.get_feature_names_out(cat_cols)
# Reuse data's index: after the outlier filters the index has gaps, and a
# default RangeIndex would make concat(axis=1) pair encoded rows with the
# wrong observations and introduce NaN rows.
ohe_features=pd.DataFrame(ohe_features,columns=ohe_labels,index=data.index)
data=pd.concat([data.drop(columns=cat_cols),ohe_features],axis=1)

# Safety net: drop rows that still contain a missing value in any column.
data=data.dropna()

In [None]:
# Per-column count of missing values (not displayed: only a cell's final
# expression is rendered).
data.isna().sum()

# Remove two of the one-hot indicator columns along with the
# mths_since_recent_inq feature, then print the schema of the resulting frame.
data = data.drop(
    [
        'verification_status_Not Verified',
        'home_ownership_RENT',
        'mths_since_recent_inq',
    ],
    axis=1,
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84790 entries, 0 to 92082
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   all_util                      84790 non-null  float64
 1   annual_inc                    84790 non-null  float64
 2   chargeoff_within_12_mths      84790 non-null  float64
 3   collections_12_mths_ex_med    84790 non-null  float64
 4   delinq_2yrs                   84790 non-null  float64
 5   dti                           84790 non-null  float64
 6   fico_range_high               84790 non-null  float64
 7   fico_range_low                84790 non-null  float64
 8   inq_last_12m                  84790 non-null  float64
 9   mo_sin_old_il_acct            84790 non-null  float64
 10  mo_sin_old_rev_tl_op          84790 non-null  float64
 11  mort_acc                      84790 non-null  float64
 12  mths_since_last_record        84790 non-null  float64
 13  open_a

In [None]:
# Data Modeling
# Features are every column except the target; the target stays a one-column frame.
X = data.drop(columns=['int_rate'])
Y = data[['int_rate']]

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=123,
    test_size=0.25,
)

# Standardization: learn mean / standard deviation on the training features only,
# then apply that same transformation to the test features.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    C=[0.5, 1],
    kernel=['linear', 'rbf'],
)



In [None]:
# Support vector regressor with default settings; tuned by the grid search.
SVMreg = svm.SVR()

In [None]:
# Search the hyperparameter grid with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=SVMreg, param_grid=param_grid, cv=5)

# Fit the grid search object to the scaled training data; the target is
# flattened to a 1-D array before fitting.
y_train_flat = Y_train.to_numpy().ravel()
grid_search.fit(X_train_s, y_train_flat)

# Keep the estimator refit with the best hyperparameter combination.
best_svm = grid_search.best_estimator_

print('Best hyperparameters:', grid_search.best_params_)

In [None]:
#Generate predictions with the best model
Y_pred_tuned=best_svm.predict(X_test_s)

# mean_squared_error returns the MSE; take its square root so the value
# printed under the "RMSE" label really is the root mean squared error.
print("RMSE:",np.sqrt(mean_squared_error(Y_test,Y_pred_tuned)))
print("MAE:",mean_absolute_error(Y_test,Y_pred_tuned))