In [2]:
import numpy as np
import pandas as pd
from sklearn import *
import random
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from datetime import datetime
from matplotlib import pyplot as plt

In [3]:
# Read the three raw CSV inputs in one pass:
#   df1 <- ML_Bond_metadata.csv, df2 <- sell.csv, df3 <- buy.csv
raw_csv_paths = [
    "G:/Goldman Sachs/Bond Liquidity Prediction/ML_Bond_metadata.csv",
    "G:/Goldman Sachs/Bond Liquidity Prediction/sell.csv",
    "G:/Goldman Sachs/Bond Liquidity Prediction/buy.csv",
]
df1, df2, df3 = [pd.read_csv(csv_path) for csv_path in raw_csv_paths]

In [4]:
# Rename the metadata key column 'isin' to 'id'; the later merge with the
# trade table joins on 'id'.
isin_to_id = {'isin': 'id'}
df1.rename(columns=isin_to_id, inplace=True)

In [5]:
# Feature engineering on the bond metadata table (df1).

# Apply sqrt(log(x)) to both amount columns.
# NOTE(review): log(x) is negative for x < 1 (and -inf at 0), which makes the
# sqrt NaN -- confirm both columns are always >= 1.
for amount_col in ['amtIssued', 'amtOutstanding']:
    df1[amount_col] = np.sqrt(np.log(df1[amount_col]))

# Missing values are replaced by assigning the result back to df1 rather than
# calling replace(..., inplace=...) on an attribute-accessed column: that form
# is a chained assignment (newer pandas warns about it and, with
# copy-on-write, it no longer updates df1), and a non-bool `inplace=1` is
# rejected by pandas' argument validation. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
df1['couponFrequency'] = df1['couponFrequency'].replace(np.nan, -1)

# (source date column, prefix of the derived columns)
date_columns = [
    ('issueDate', 'issue'),
    ('maturity', 'maturity'),
    ('ratingAgency1EffectiveDate', 'ratingAgency1'),
    ('ratingAgency2EffectiveDate', 'ratingAgency2'),
]
for date_col, prefix in date_columns:
    # Missing dates are set to -1 before parsing, exactly as before.
    # NOTE(review): pd.to_datetime then has to interpret -1 as a timestamp
    # instead of leaving the value missing; the Day_differ minimum of -17108
    # printed by the next cell looks like a consequence. Kept unchanged here
    # because the test CSVs are later indexed with the same feature names and
    # may rely on the same encoding -- confirm before switching to NaT.
    df1[date_col] = pd.to_datetime(df1[date_col].replace(np.nan, -1))
    df1[prefix + '_year'] = df1[date_col].dt.year
    df1[prefix + '_month'] = df1[date_col].dt.month
    # Despite the '_day' suffix this is the day of the week, not of the month.
    df1[prefix + '_day'] = df1[date_col].dt.dayofweek

# Time from issue to maturity, still a timedelta at this point; the next cell
# converts it to an integer number of days.
df1['Day_differ'] = df1['maturity'] - df1['issueDate']

In [6]:
# Turn the maturity-minus-issue timedelta into an integer day count, then
# display its summary statistics.
# Note: this cell expects Day_differ to still be a timedelta, so it is meant
# to run once right after the feature-engineering cell.
one_day = np.timedelta64(1, 'D')
day_count = df1['Day_differ'] / one_day
df1['Day_differ'] = day_count.astype(int)
df1['Day_differ'].describe()

count    17261.000000
mean      4284.689126
std       4357.315430
min     -17108.000000
25%       1857.000000
50%       3641.000000
75%       4395.000000
max      36723.000000
Name: Day_differ, dtype: float64

In [7]:
# One-hot encode each categorical metadata column and attach the indicator
# columns to df1. Indicator columns are named after the raw category values
# (get_dummies is called without a prefix).
# NOTE(review): if two source columns share a category label, the later
# assignment overwrites the earlier indicator column -- confirm the labels
# are unique across these columns.
categorical_columns = [
    'market',
    'collateralType',
    'couponType',
    'industryGroup',
    'industrySector',
    'industrySubgroup',
    'maturityType',
    'securityType',
    'paymentRank',
    'ratingAgency1Rating',
    'ratingAgency1Watch',
    'ratingAgency2Rating',
    'ratingAgency2Watch',
]
for column in categorical_columns:
    dummies = pd.get_dummies(df1[column])
    indicator_names = dummies.columns
    df1[indicator_names] = dummies

In [8]:
# Remove the raw columns that have been replaced by derived features:
# the four date columns and the thirteen one-hot encoded categoricals.
raw_date_columns = [
    'issueDate',
    'maturity',
    'ratingAgency1EffectiveDate',
    'ratingAgency2EffectiveDate',
]
encoded_categorical_columns = [
    'market',
    'collateralType',
    'couponType',
    'industryGroup',
    'industrySector',
    'industrySubgroup',
    'maturityType',
    'securityType',
    'paymentRank',
    'ratingAgency1Rating',
    'ratingAgency1Watch',
    'ratingAgency2Rating',
    'ratingAgency2Watch',
]
df1.drop(raw_date_columns + encoded_categorical_columns, axis=1, inplace=True)

In [9]:
# '144aFlag' starts with a digit, so it cannot be reached with attribute
# access; rename it to 'Flag' (the next cell uses df1.Flag).
flag_rename = {'144aFlag': 'Flag'}
df1.rename(columns=flag_rename, inplace=True)

In [10]:
# Map the string labels 'flag0' / 'flag1' to the integers 0 / 1.
# The result is assigned back to df1 instead of using
# df1.Flag.replace(..., inplace=1): `inplace` must be a real bool (pandas
# validates the argument), and an in-place replace on an attribute-accessed
# column is a chained assignment that does not update df1 under pandas
# copy-on-write.
df1['Flag'] = df1['Flag'].replace({'flag0': 0, 'flag1': 1})

In [11]:
# Display summary statistics (count / unique / top / freq) of the issuer
# column in the bond metadata table.
df1.issuer.describe()

count          17261
unique          4045
top       issuer1677
freq             338
Name: issuer, dtype: object

In [12]:
# Drop the 'side', 'date' and 'time' columns from the sell table; they are
# not carried into the merged training frame.
unused_sell_columns = ['side', 'date', 'time']
df2.drop(unused_sell_columns, axis=1, inplace=True)

In [13]:
# Transform the target to sqrt(log(sellvolume)); predictions are mapped back
# with exp(pred ** 2) before they are written out.
log_sell_volume = np.log(df2['sellvolume'])
df2['sellvolume'] = np.sqrt(log_sell_volume)

In [14]:
# List the distinct values of the Day column before encoding them as integers.
df2.Day.unique()

array(['Fri', 'Wed', 'Thu', 'Mon', 'Tue'], dtype=object)

In [15]:
# Encode the weekday abbreviations as integers (Mon=1 ... Fri=5).
# Assigned back to df2 instead of df2.Day.replace(..., inplace=1): `inplace`
# must be a real bool (pandas validates the argument), and an in-place replace
# on an attribute-accessed column is a chained assignment that does not
# update df2 under pandas copy-on-write.
weekday_codes = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5}
df2['Day'] = df2['Day'].replace(weekday_codes)

In [16]:
# Attach the bond metadata to every sell row. A left join keeps all rows of
# df2; rows whose 'id' has no match in df1 get NaN in the metadata columns.
train_sell = df2.merge(df1, how='left', on='id')

In [17]:
# Display summary statistics (count / unique / top / freq) of the issuer
# column after the merge.
train_sell.issuer.describe()

count         176101
unique          2103
top       issuer1615
freq            5292
Name: issuer, dtype: object

In [25]:
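# Disabled: label-encoding of `issuer`. Kept for reference only; `issuer` is
# excluded from the model features in the next cell.
# from sklearn import preprocessing
# le_Type = preprocessing.LabelEncoder()
# train_sell.issuer= le_Type.fit_transform(train_sell.issuer)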

In [18]:
# Columns that must not be fed to the model: the row identifier, the target,
# and the raw (un-encoded) issuer label.
ID_col = ['id']
target_col = ['sellvolume']
cat_col = ['issuer']

# Every remaining column is a feature. The list keeps the column order of
# train_sell: building it from a set difference produced an arbitrary order
# that can change from one interpreter run to the next, which made the
# column order seen by the model non-reproducible.
excluded_columns = set(ID_col + target_col + cat_col)
features = [col for col in train_sell.columns if col not in excluded_columns]

In [19]:
# Read the three test files (10, 13 and 14 June 2016, per the file names).
test_csv_paths = [
    "G:/Goldman Sachs/Bond Liquidity Prediction/test_10jun2016.csv",
    "G:/Goldman Sachs/Bond Liquidity Prediction/test_13jun2016.csv",
    "G:/Goldman Sachs/Bond Liquidity Prediction/test_14jun2016.csv",
]
test_10, test_13, test_14 = [pd.read_csv(csv_path) for csv_path in test_csv_paths]

In [20]:
# Display summary statistics (count / unique / top / freq) of the issuer
# column in the 13 June test file.
test_13.issuer.describe()

count          17261
unique          4045
top       issuer1677
freq             338
Name: issuer, dtype: object

In [21]:
# 3-fold cross-validation of a gradient-boosting regressor on the sell data.
#   cv  - out-of-fold predictions, appended fold by fold
#   lst - one prediction vector for test_10 per fold (averaged afterwards)
# The MSE of each fold (on the transformed target) is printed.
SEED = 100
N_FOLDS = 3

# Build the (train_idx, validation_idx) pairs. The folds are NOT shuffled:
# they are consecutive slices of train_sell, so `cv` ends up in the same row
# order as train_sell, which the later pairing with train_sell.id relies on.
# (The former random_state=0 had no effect without shuffling, so it is gone.)
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
    fold_indices = list(KFold(n_splits=N_FOLDS).split(train_sell))
except ImportError:
    # Older releases only provide the cross_validation module.
    fold_indices = list(cross_validation.KFold(len(train_sell), n_folds=N_FOLDS))

# Loop-invariant selections, done once instead of once per fold.
x_all = train_sell[features]
y_all = train_sell.sellvolume
x_test = test_10[features]

# Kept for any later code that uses the `random` module; the estimator itself
# does not read this seed, hence random_state below.
random.seed(SEED)

cv = []
lst = []
for idx1, idx2 in fold_indices:
    x_train, x_cv = x_all.iloc[idx1], x_all.iloc[idx2]
    y_train, y_cv = y_all.iloc[idx1], y_all.iloc[idx2]
    # `rf` is a GradientBoostingRegressor despite the name; random_state
    # makes the fit reproducible.
    rf = GradientBoostingRegressor(random_state=SEED)
    rf.fit(x_train, y_train)
    cv_pred = rf.predict(x_cv)  # predicted once, reused for cv and the metric
    cv.extend(cv_pred)
    lst.append(rf.predict(x_test))
    print(metrics.mean_squared_error(y_cv, cv_pred))

In [22]:
# Test predictions: element-wise mean of the per-fold prediction vectors.
mean_test_prediction = np.average(lst, axis=0)
pred1_10jun_sell = pd.DataFrame({
    'id': test_10.id,
    'sellvolume_10': mean_test_prediction,
})

# Out-of-fold predictions paired with the training ids; this assumes `cv` is
# in the same row order as train_sell.
pred1_10jun_sell_cv = pd.DataFrame({
    'id': train_sell.id,
    'sellvolume_10': cv,
})

In [23]:
# Undo the sqrt(log(.)) target transform, volume = exp(pred ** 2), then write
# the test predictions to CSV without the index column.
squared_test_pred = np.power(pred1_10jun_sell.sellvolume_10, 2)
pred1_10jun_sell.sellvolume_10 = np.exp(squared_test_pred)
pred1_10jun_sell.to_csv(
    "C:/Users/Rajat/Desktop/AD_10jun_sellvolume.csv",
    index=False,
)

In [24]:
# Undo the sqrt(log(.)) target transform, volume = exp(pred ** 2), on the
# out-of-fold predictions, then write them to CSV without the index column.
squared_cv_pred = np.power(pred1_10jun_sell_cv.sellvolume_10, 2)
pred1_10jun_sell_cv.sellvolume_10 = np.exp(squared_cv_pred)
pred1_10jun_sell_cv.to_csv(
    "C:/Users/Rajat/Desktop/AD_10jun_sellvolume_cv.csv",
    index=False,
)