# Applying logistic regression on Kaggle data

In [66]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

In [67]:
# Read the labelled training signal first, then the unlabelled Kaggle test signal.
ion_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [68]:
# Sample submission file shipped with the data; loaded to inspect the expected layout.
submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [69]:
# Preview the first five rows of the sample submission.
submission.head(n=5)

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0


In [70]:
# Preview the first five rows of the training data.
ion_data.head(n=5)

Unnamed: 0,time,signal,open_channels
0,0.0001,-2.76,0
1,0.0002,-2.8557,0
2,0.0003,-2.4074,0
3,0.0004,-3.1404,0
4,0.0005,-3.1525,0


In [71]:
# Preview the first five rows of the Kaggle test data.
test_data.head(n=5)

Unnamed: 0,time,signal
0,500.0001,-2.6498
1,500.0002,-2.8494
2,500.0003,-2.86
3,500.0004,-2.435
4,500.0005,-2.6155


In [72]:
#features for training data

# Number of consecutive rows that make up one batch.
BATCH_SIZE = 500000

# Batch id = row position // BATCH_SIZE, i.e. 0 for the first BATCH_SIZE rows,
# 1 for the next BATCH_SIZE rows, and so on. Deriving it from the row position
# covers every row regardless of file length and does not depend on which
# column position 'batch' happens to occupy in the frame.
ion_data['batch'] = pd.RangeIndex(len(ion_data)).values // BATCH_SIZE

# Rolling mean / standard deviation of the signal over the whole series
# (windows are allowed to straddle batch boundaries).
signal_rolling = ion_data['signal'].rolling(window=50)
ion_data['simple_moving_avg_50'] = signal_rolling.mean()
ion_data['rolling_std_50'] = signal_rolling.std()

# The same statistics restarted inside each batch. groupby().rolling() returns
# a (batch, original row index) MultiIndex; dropping the batch level lets pandas
# align the result on the original row index instead of relying on the grouped
# output happening to come back in row order.
batch_rolling = ion_data.groupby('batch')['signal'].rolling(window=50)
ion_data['simple_moving_avg_50_batch'] = batch_rolling.mean().reset_index(level=0, drop=True)
ion_data['moving_std_50_batch'] = batch_rolling.std().reset_index(level=0, drop=True)

# The first 49 rows of every rolling window have no value yet (NaN); store 0 instead.
ion_data = ion_data.fillna(0)

# Persist the training data together with the engineered features.
ion_data.to_csv('data/ion_data.csv',index=False)

In [73]:
#features for test data

# Number of consecutive rows that make up one batch.
BATCH_SIZE = 500000

# Batch id = row position // BATCH_SIZE, i.e. 0 for the first BATCH_SIZE rows,
# 1 for the next BATCH_SIZE rows, and so on. Deriving it from the row position
# covers every row regardless of file length and does not depend on which
# column position 'batch' happens to occupy in the frame.
test_data['batch'] = pd.RangeIndex(len(test_data)).values // BATCH_SIZE

# Rolling mean / standard deviation of the signal over the whole series
# (windows are allowed to straddle batch boundaries).
signal_rolling = test_data['signal'].rolling(window=50)
test_data['simple_moving_avg_50'] = signal_rolling.mean()
test_data['rolling_std_50'] = signal_rolling.std()

# The same statistics restarted inside each batch. groupby().rolling() returns
# a (batch, original row index) MultiIndex; dropping the batch level lets pandas
# align the result on the original row index instead of relying on the grouped
# output happening to come back in row order.
batch_rolling = test_data.groupby('batch')['signal'].rolling(window=50)
test_data['simple_moving_avg_50_batch'] = batch_rolling.mean().reset_index(level=0, drop=True)
test_data['moving_std_50_batch'] = batch_rolling.std().reset_index(level=0, drop=True)

# The first 49 rows of every rolling window have no value yet (NaN); store 0 instead.
test_data = test_data.fillna(0)

# Persist the test data together with the engineered features.
test_data.to_csv('data/test_data_with_features.csv',index=False)

In [74]:
 # Preview the last five rows of the test data, including the engineered features.
 test_data.tail(n=5)

Unnamed: 0,time,signal,batch,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
1999995,699.9996,-2.9092,3,-2.766912,0.227497,-2.766912,0.227497
1999996,699.9997,-2.7422,3,-2.772244,0.223622,-2.772244,0.223622
1999997,699.9998,-2.8285,3,-2.780104,0.218389,-2.780104,0.218389
1999998,699.9999,-2.9092,3,-2.7871,0.216772,-2.7871,0.216772
1999999,700.0,-2.7422,3,-2.788486,0.216248,-2.788486,0.216248


Applying Logistic Regression

In [75]:
 # Fit a multinomial logistic regression and evaluate it on a held-out split.

#target is the open-channel count; predictors are every column except
#time, signal, the target itself and the batch id
y = ion_data['open_channels']
X = ion_data.drop(columns=['time','signal','open_channels','batch'])

#hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42, test_size=0.3)

#build the classifier and train it in one step (fit() returns the estimator itself)
logreg = LogisticRegression(solver='newton-cg', multi_class='multinomial').fit(X_train, y_train)

#predict the classes of the held-out rows
y_pred = logreg.predict(X_test)

#confusion matrix of true vs. predicted classes
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

#per-class precision / recall / f1 / support
print(classification_report(y_true=y_test, y_pred=y_pred))

[[323220  38719   3196   6185    164      0    115    339     10      0
       0]
 [133832 118250  15105  24141   1066      5    836   2562    233      0
       0]
 [   220  67304  28700  54782   5546      0   1621   6272   1615      0
       0]
 [     8  64596  28234  79928  15273      0   1291   7790   3641      0
       0]
 [     0   7555  11379  68317  19960      1    893   7383   5471      0
       0]
 [     0   3518   4966  39426  11771     20   1459  14146   7985      0
       0]
 [     0      3   2334  12963   3446      0   1640  22678  13327      0
       0]
 [     0      3   1325  18385   7427      0   1559  28928  21729      0
       0]
 [     0      2    419  14510   9482      0   1066  25553  22559      0
       0]
 [     0      0     51   5554   6985      0    490  14272  13560      3
       1]
 [     0      0      2    745   2242      3    107   3754   3808      5
      31]]
              precision    recall  f1-score   support

           0       0.71      0.87      0.7

In [76]:
#lets apply the model to predict the values for kaggles test data
kaggle_test_data=test_data.iloc[:,3:]

#predict for kaggle data
kaggle_y_pred = logreg.predict(kaggle_test_data)

In [77]:
# Wrap the predictions in a DataFrame so they can be inspected and attached to the submission.
kaggle_y_pred = pd.DataFrame(data=kaggle_y_pred)

In [78]:
# Distinct classes the model predicted for the Kaggle test data, in order of first appearance.
kaggle_y_pred.loc[:, 0].unique()

array([ 0,  1,  2,  3,  6,  7,  5, 10,  4,  8])

In [79]:
# Assemble the submission: the test timestamps next to the predicted channel counts.
submission3 = pd.DataFrame()
submission3['time'] = test_data['time']
submission3['open_channels'] = kaggle_y_pred.astype(int)

# Write the timestamps as text with exactly four decimals, then save without the index.
submission3['time'] = submission3['time'].map('{:.4f}'.format)
submission3.to_csv('data/logistic_submission3.csv', index=False)

In [80]:
# Preview the first five rows of the generated submission.
submission3.head(n=5)

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0


In [None]:
 # Report accuracy on the held-out split. Uses the names this notebook defines
 # (logreg, X_test, y_test) and the print() function used in the other cells;
 # the classifier's score() returns the mean accuracy on the given data and labels.
 print("Multinomial Logistic regression Test Accuracy :: ", logreg.score(X_test, y_test))