In [374]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [375]:
data = pd.read_csv('train.csv')
x = data.iloc[:,1:5]
y = data.iloc[:,5]
test1 = pd.read_csv('test.csv')
test=test1.iloc[:,1:]
test1.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,659,2,12,3000,52
1,276,21,7,1750,38
2,263,4,1,250,4
3,303,11,11,2750,38
4,83,4,12,3000,34


In [376]:
merged_data = pd.concat([x, test])
merged_data.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,2,50,12500,98
1,0,13,3250,28
2,1,16,4000,35
3,2,20,5000,45
4,1,24,6000,77


In [377]:
merged_data['average_donation'] = (merged_data['Months since First Donation'] / merged_data['Number of Donations'])

In [378]:
merged_data['time_between_donation'] = ((merged_data['Months since First Donation'] - merged_data['Months since Last Donation']) / merged_data['Number of Donations'])

In [379]:
train_data = merged_data[:len(x)]
test_data = merged_data[len(x):]

test_data

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,average_donation,time_between_donation
0,2,12,3000,52,4.333333,4.166667
1,21,7,1750,38,5.428571,2.428571
2,4,1,250,4,4.000000,0.000000
3,11,11,2750,38,3.454545,2.454545
4,4,12,3000,34,2.833333,2.500000
5,3,21,5250,42,2.000000,1.857143
6,4,2,500,4,2.000000,0.000000
7,14,1,250,14,14.000000,0.000000
8,23,2,500,87,43.500000,32.000000
9,14,4,1000,64,16.000000,12.500000


In [380]:
scaler = StandardScaler()

train_data =  scaler.fit_transform(train_data)

train_data


array([[-0.91073873,  7.77205216,  7.77205216,  2.6418237 , -0.91236291,
        -0.51660142],
       [-1.15558611,  1.32046785,  1.32046785, -0.24994586, -0.88601052,
        -0.46698789],
       [-1.03316242,  1.84356928,  1.84356928,  0.0392311 , -0.88143545,
        -0.47310798],
       ..., 
       [ 1.41531137, -0.59757073, -0.59757073,  0.74151799,  2.35575248,
         2.36457434],
       [ 3.61893779, -0.77193788, -0.77193788,  0.20447507,  4.12303618,
        -0.9239547 ],
       [ 7.65891955, -0.77193788, -0.77193788,  1.56773787,  8.60921788,
        -0.9239547 ]])

In [381]:
x_train, x_test, y_train, y_test = train_test_split(train_data, y, test_size = 0.20)

In [382]:
lg = LogisticRegression()

In [383]:
lg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [384]:
score =cross_val_score(lg, x_train, y_train, cv=10)
score

array([ 0.74468085,  0.80851064,  0.80851064,  0.74468085,  0.76086957,
        0.76086957,  0.77777778,  0.71111111,  0.73333333,  0.73333333])

In [385]:
predict = lg.predict(x_train)
print ('Logistic Regression - Entropy Loss: ', log_loss(y_train, predict))

Logistic Regression - Entropy Loss:  8.33437603349


In [386]:
print("Accuracy: %1.3f (+/- %1.3f)" % (score.mean(), score.std()*2))

Accuracy: 0.758 (+/- 0.061)


In [387]:
y_pred = lg.predict_proba(test_data)

In [388]:
logregr = LogisticRegressionCV(cv = 5, random_state=12, scoring ='neg_log_loss')
logregr = logregr.fit(x_train, y_train)

y_cv_logregr = logregr.predict_proba(x_test)
print ('Logistic Regression - Entropy Loss: ', log_loss(y_test, y_cv_logregr))

Logistic Regression - Entropy Loss:  0.454553170602


In [396]:
y_cv_logregr = logregr.predict_proba(test_data)

In [397]:
#Random Forest

random = RandomForestClassifier(n_estimators=1000)
random.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [398]:
scores =cross_val_score(lg, x_train, y_train, cv=10)
scores

array([ 0.74468085,  0.80851064,  0.80851064,  0.74468085,  0.76086957,
        0.76086957,  0.77777778,  0.71111111,  0.73333333,  0.73333333])

In [399]:
prediction = random.predict_proba(test_data)

print("Accuracy: %1.3f (+/- %1.3f)" % (score.mean(), score.std()*2))

Accuracy: 0.758 (+/- 0.061)


In [400]:
rmloss = lg.predict(x_train)
print ('RM - Entropy Loss: ', log_loss(y_train, rmloss))

RM - Entropy Loss:  8.33437603349
