In [1]:
# load libraries

import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

pd.set_option('display.max_colwidth',100)



In [2]:
#load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# #load goal data
# train_goal = pd.read_csv('train_goal.csv')
# test_goal = pd.read_csv('test_goal.csv')

In [3]:
# convert unix time format
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))

# Feature Engineering

In [4]:
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

for i in np.arange(2):
    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)
    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)

In [5]:
train['name_count'] = train['name'].str.split().str.len()
train['desc_count'] = train['desc'].str.split().str.len()

test['name_count'] = test['name'].str.split().str.len()
test['desc_count'] = test['desc'].str.split().str.len()

In [6]:
train['keywords_len'] = train['keywords'].str.len()
train['keywords_count'] = train['keywords'].str.split('-').str.len()

test['keywords_len'] = test['keywords'].str.len()
test['keywords_count'] = test['keywords'].str.split('-').str.len()

In [7]:
# converting string variables to datetime
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))

In [8]:
# there should be simpler way - might take longer
# creating list with time difference between 1) launched_at and created_at 2) deadline and launched_at

time1 = []
time3 = []
for i in np.arange(train.shape[0]):
    time1.append(np.round((train.loc[i, 'launched_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))
    time3.append(np.round((train.loc[i, 'deadline'] - train.loc[i, 'launched_at']).total_seconds()).astype(int))

In [9]:
train['time1'] = np.log(time1)
train['time3']= np.log(time3)

In [10]:
# for test data
time5 = []
time6 = []
for i in np.arange(test.shape[0]):
    time5.append(np.round((test.loc[i, 'launched_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))
    time6.append(np.round((test.loc[i, 'deadline'] - test.loc[i, 'launched_at']).total_seconds()).astype(int))

In [11]:
test['time1'] = np.log(time5)
test['time3'] = np.log(time6)

In [12]:
feat = ['disable_communication','country']

for x in feat:
    le = LabelEncoder()
    le.fit(list(train[x].values) + list(test[x].values))
    train[x] = le.transform(list(train[x]))
    test[x] = le.transform(list(test[x]))

In [134]:
# train = pd.merge(train, train_goal, on='project_id', how='outer')
# test = pd.merge(test, test_goal, on='project_id', how='outer')

In [13]:
train['goal'] = np.log1p(train['goal'])
test['goal'] = np.log1p(test['goal'])

# Text Cleaning

In [14]:
# creating a full list of descriptions from train and etst
kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)
kickname = pd.Series(train['name'].tolist() + test['name'].tolist()).astype(str)

In [15]:
# this function cleans punctuations, digits and irregular tabs. Then converts the sentences to lower
def desc_clean(word):
    p1 = re.sub(pattern='(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    p1 = p1.lower()
    return p1

kickdesc = kickdesc.map(desc_clean)
kickname = kickname.map(desc_clean)

In [16]:
import nltk
nltk.data.path.append('C:\\Users\\user\\Anaconda3\\envs\\keras_tf\\Lib\\site-packages\\nltk_data')

In [17]:
stop = set(stopwords.words('english'))
kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]

stemmer = SnowballStemmer(language='english')
kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]

kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]

kickdesc = [' '.join(x) for x in kickdesc]

In [18]:
stop = set(stopwords.words('english'))
kickname = [[x for x in x.split() if x not in stop] for x in kickname]

stemmer = SnowballStemmer(language='english')
kickname = [[stemmer.stem(x) for x in x] for x in kickname]

kickname = [[x for x in x if len(x) > 2] for x in kickname]

kickname = [' '.join(x) for x in kickname]

In [19]:
# Due to memory error, limited the number of features to 650
cv = CountVectorizer(max_features=700)

In [20]:
alldesc = cv.fit_transform(kickdesc).todense()
allname = cv.fit_transform(kickname).todense()

In [21]:
#create a data frame
combine = pd.DataFrame(alldesc)
combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)
combine1 = pd.DataFrame(allname)
combine1.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)

In [22]:
#split the text features

train_text = combine[:train.shape[0]]
test_text = combine[train.shape[0]:]

train_text1 = combine1[:train.shape[0]]
test_text1 = combine1[train.shape[0]:]

test_text.reset_index(drop=True,inplace=True)
test_text1.reset_index(drop=True,inplace=True)

In [23]:
cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','goal']

In [24]:
target = train['final_status']

In [25]:
train = train.loc[:,cols_to_use]
test = test.loc[:,cols_to_use]

In [26]:
X_train = pd.concat([train, train_text,train_text1],axis=1)
X_test = pd.concat([test, test_text,test_text1],axis=1)

In [65]:
print(X_train.shape)
print(X_test.shape)

(108129, 1409)
(63465, 1409)


In [29]:
dtrain = xgb.DMatrix(data=X_train, label = target)
dtest = xgb.DMatrix(data=X_test)

In [30]:
params = {
    'objective':'binary:logistic',
    'eval_metric':'error',
    'eta':0.025,
    'max_depth':6,
    'subsample':0.7,
    'colsample_bytree':0.7,
    'min_child_weight':5
    
}

In [None]:
# You can probably get better accuracy with rounds > 1000. 
bst = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=40,nfold=5,verbose_eval=10)

In [31]:
bst_train = xgb.train(params, dtrain, num_boost_round=1500)

In [32]:
p_test = bst_train.predict(dtest)

In [36]:
sub = pd.DataFrame()
sub['project_id'] = test['project_id']
sub['final_status'] = p_test

In [37]:
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]

In [38]:
sub.to_csv("xgb_with_python_feats.csv",index=False) #0.70

In [27]:
import lightgbm as lgb
train_data = lgb.Dataset(X_train.values, label=target.values)

In [29]:
#setting parameters for lightgbm
param = {'num_leaves':100, 'objective':'binary','max_depth':10,'learning_rate':0.05,'max_bin':500}
param['metric'] = ['accuracy']
#depth8,0.05,30
#10,0.05,100

In [30]:
#training our model using light gbm
num_round=2000
# start=datetime.now()
lgbm=lgb.train(param,train_data,num_round)
# stop=datetime.now()

In [31]:
y_pred = lgbm.predict(X_test)

In [32]:
test1 = pd.read_csv('test.csv')

In [33]:
sub = pd.DataFrame()
sub['project_id'] = test1['project_id']
sub['final_status'] = y_pred

In [34]:
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]

In [35]:
sub.to_csv("lgbm_with_python_extrafeats3.csv",index=False) #0.70

In [159]:
#ensemble
y_ens = 0.6*y_pred + 0.4*p_test
sub = pd.DataFrame()
sub['project_id'] = test1['project_id']
sub['final_status'] = y_ens
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub.to_csv("lgbm_xgb_4.csv",index=False) #0.70

In [36]:
%pylab inline

import os
import numpy as np
import pandas as pd
from scipy.misc import imread
from sklearn.metrics import accuracy_score

import tensorflow as tf

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [88]:
import keras

Using TensorFlow backend.


In [89]:
# To stop potential randomness
seed = 128
rng = np.random.RandomState(seed)

In [98]:
train_x = X_train.as_matrix()

In [99]:
split_size = int(X_train.shape[0]*0.7)

train_x, val_x = train_x[:split_size], train_x[split_size:]
train_y, val_y = target.values[:split_size], target.values[split_size:]

In [120]:
from keras.models import Sequential
from keras.layers import Dense

In [124]:
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors"""
    num_labels = labels_dense.shape[0]
    index_offset = numpy.arange(num_labels) * num_classes
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    
    return labels_one_hot

def preproc(unclean_batch_x):
    """Convert values to range 0-1"""
    temp_batch = unclean_batch_x / unclean_batch_x.max()
    
    return temp_batch

def batch_creator(batch_size, dataset_length, dataset_name):
    """Create batch with random samples and return appropriate format"""
    batch_mask = rng.choice(dataset_length, batch_size)
    
    batch_x = eval(dataset_name + '_x')[[batch_mask]].reshape(-1, 784)
    batch_x = preproc(batch_x)
    
    if dataset_name == 'train':
        batch_y = eval(dataset_name).ix[batch_mask, 'label'].values
        batch_y = dense_to_one_hot(batch_y)
        
    return batch_x, batch_y

In [144]:
# number of neurons in each layer
input_num_units = 1409
hidden_num_units = 500
output_num_units = 2

# define placeholders, i.e. way to feed values to computational graph
x = tf.placeholder(tf.float32, [None, input_num_units])
y = tf.placeholder(tf.float32, [None, output_num_units])

# set remaining parameters
epochs = 5
batch_size = 128
learning_rate = 0.01

In [145]:
weights = {
    'hidden': tf.Variable(tf.random_normal([input_num_units, hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([hidden_num_units, output_num_units], seed=seed))
}

biases = {
    'hidden': tf.Variable(tf.random_normal([hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([output_num_units], seed=seed))
}

In [146]:
hidden_layer = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
hidden_layer = tf.nn.relu(hidden_layer)

output_layer = tf.matmul(hidden_layer, weights['output']) + biases['output']

In [147]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=output_layer,logits= y))

In [148]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'Variable:0' shape=(1409, 800) dtype=float32_ref>", "<tf.Variable 'Variable_1:0' shape=(800, 2) dtype=float32_ref>", "<tf.Variable 'Variable_2:0' shape=(800,) dtype=float32_ref>", "<tf.Variable 'Variable_3:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_4:0' shape=(1409, 500) dtype=float32_ref>", "<tf.Variable 'Variable_5:0' shape=(500, 2) dtype=float32_ref>", "<tf.Variable 'Variable_6:0' shape=(500,) dtype=float32_ref>", "<tf.Variable 'Variable_7:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_8:0' shape=(1409, 1000) dtype=float32_ref>", "<tf.Variable 'Variable_9:0' shape=(1000, 2) dtype=float32_ref>", "<tf.Variable 'Variable_10:0' shape=(1000,) dtype=float32_ref>", "<tf.Variable 'Variable_11:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_12:0' shape=(1409, 1000) dtype=float32_ref>", "<tf.Variable 'Variable_13:0' shape=(1000, 2) dtype=float32_ref>", "<tf.Variable 'Variable_14:0' shape=(1000,) dtype=float32_ref>", "<tf.Variable 'Variable_15:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_16:0' shape=(1409, 1200) dtype=float32_ref>", "<tf.Variable 'Variable_17:0' shape=(1200, 2) dtype=float32_ref>", "<tf.Variable 'Variable_18:0' shape=(1200,) dtype=float32_ref>", "<tf.Variable 'Variable_19:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_20:0' shape=(1409, 1300) dtype=float32_ref>", "<tf.Variable 'Variable_21:0' shape=(1300, 2) dtype=float32_ref>", "<tf.Variable 'Variable_22:0' shape=(1300,) dtype=float32_ref>", "<tf.Variable 'Variable_23:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_24:0' shape=(1409, 1000) dtype=float32_ref>", "<tf.Variable 'Variable_25:0' shape=(1000, 2) dtype=float32_ref>", "<tf.Variable 'Variable_26:0' shape=(1000,) dtype=float32_ref>", "<tf.Variable 'Variable_27:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'dense_1/kernel:0' shape=(1409, 150) dtype=float32_ref>", "<tf.Variable 'dense_1/bias:0' shape=(150,) dtype=float32_ref>", "<tf.Variable 'dense_2/kernel:0' shape=(150, 2) dtype=float32_ref>", "<tf.Variable 'dense_2/bias:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'iterations:0' shape=() dtype=float32_ref>", "<tf.Variable 'lr:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'decay:0' shape=() dtype=float32_ref>", "<tf.Variable 'dense_3/kernel:0' shape=(1409, 150) dtype=float32_ref>", "<tf.Variable 'dense_3/bias:0' shape=(150,) dtype=float32_ref>", "<tf.Variable 'dense_4/kernel:0' shape=(150, 2) dtype=float32_ref>", "<tf.Variable 'dense_4/bias:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'iterations_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'lr_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_1_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_2_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'decay_1:0' shape=() dtype=float32_ref>", "<tf.Variable 'dense_5/kernel:0' shape=(1409, 150) dtype=float32_ref>", "<tf.Variable 'dense_5/bias:0' shape=(150,) dtype=float32_ref>", "<tf.Variable 'dense_6/kernel:0' shape=(150, 1) dtype=float32_ref>", "<tf.Variable 'dense_6/bias:0' shape=(1,) dtype=float32_ref>", "<tf.Variable 'iterations_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'lr_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_1_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_2_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'decay_2:0' shape=() dtype=float32_ref>", "<tf.Variable 'dense_7/kernel:0' shape=(1409, 150) dtype=float32_ref>", "<tf.Variable 'dense_7/bias:0' shape=(150,) dtype=float32_ref>", "<tf.Variable 'dense_8/kernel:0' shape=(150, 2) dtype=float32_ref>", "<tf.Variable 'dense_8/bias:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'iterations_3:0' shape=() dtype=float32_ref>", "<tf.Variable 'lr_3:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_1_3:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_2_3:0' shape=() dtype=float32_ref>", "<tf.Variable 'decay_3:0' shape=() dtype=float32_ref>", "<tf.Variable 'dense_9/kernel:0' shape=(1409, 150) dtype=float32_ref>", "<tf.Variable 'dense_9/bias:0' shape=(150,) dtype=float32_ref>", "<tf.Variable 'dense_10/kernel:0' shape=(150, 2) dtype=float32_ref>", "<tf.Variable 'dense_10/bias:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'iterations_4:0' shape=() dtype=float32_ref>", "<tf.Variable 'lr_4:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_1_4:0' shape=() dtype=float32_ref>", "<tf.Variable 'beta_2_4:0' shape=() dtype=float32_ref>", "<tf.Variable 'decay_4:0' shape=() dtype=float32_ref>", "<tf.Variable 'Variable_28:0' shape=(1409, 500) dtype=float32_ref>", "<tf.Variable 'Variable_29:0' shape=(500, 10) dtype=float32_ref>", "<tf.Variable 'Variable_30:0' shape=(500,) dtype=float32_ref>", "<tf.Variable 'Variable_31:0' shape=(10,) dtype=float32_ref>", "<tf.Variable 'Variable_32:0' shape=(1409, 500) dtype=float32_ref>", "<tf.Variable 'Variable_33:0' shape=(500, 2) dtype=float32_ref>", "<tf.Variable 'Variable_34:0' shape=(500,) dtype=float32_ref>", "<tf.Variable 'Variable_35:0' shape=(2,) dtype=float32_ref>", "<tf.Variable 'Variable_36:0' shape=(1409, 500) dtype=float32_ref>", "<tf.Variable 'Variable_37:0' shape=(500, 2) dtype=float32_ref>", "<tf.Variable 'Variable_38:0' shape=(500,) dtype=float32_ref>", "<tf.Variable 'Variable_39:0' shape=(2,) dtype=float32_ref>"] and loss Tensor("Mean_31:0", shape=(), dtype=float32).

In [150]:
# define vars
input_num_units = 1409
hidden_num_units = 500
output_num_units = 2

epochs = 5
batch_size = 200

In [151]:
model = Sequential([
    Dense(output_dim=hidden_num_units, input_dim=input_num_units, activation='relu'),
        
    Dense(output_dim=output_num_units, input_dim=hidden_num_units, activation='softmax'),
])

  from ipykernel import kernelapp as app


In [152]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [153]:
trained_model = model.fit(train_x, train_y, nb_epoch=epochs, batch_size=batch_size, validation_data=(val_x, val_y))



ValueError: Error when checking target: expected dense_12 to have shape (None, 2) but got array with shape (75690, 1)