In [54]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

In [55]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

In [56]:
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [57]:
test_data.head()

Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?
2,000227734433360e1aae,What are the best made pocket knives under $20...
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?


The TfidfVectorizer converts the strings into a feature vector.

In [58]:
vec = TfidfVectorizer(ngram_range=(1,3), min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,smooth_idf=1,sublinear_tf=1,analyzer='word')

We use questions from both the training and the testing set, to build our vectorizer.

In [59]:
vec.fit_transform(pd.concat([train_data['question_text'],test_data['question_text']]))

<1362492x1165112 sparse matrix of type '<class 'numpy.float64'>'
	with 33866195 stored elements in Compressed Sparse Row format>

In [60]:
fvec_train=vec.transform(train_data['question_text'])
fvec_test=vec.transform(test_data['question_text'])


This is a model which uses the feature vector to create a Naive Bayes input to the Logistic Regression model.

In [61]:
def createNBx(f,y):
    alpha=1
    p=alpha+f[y==1.0].sum(0)
    q=alpha+f[y==0.0].sum(0)
    r=np.log(np.divide(p/p.sum(1),q/q.sum(1)))
    x=f.multiply(r)
    return x

In [62]:
x_train=createNBx(fvec_train,train_data['target'].values)

In [63]:
x_train=x_train.tocsr()

In [64]:
model=LogisticRegression(C=2)

In [65]:
rkf=StratifiedKFold(n_splits=5,random_state=42,shuffle=True)

In [None]:
  for train_idx, val_idx in rkf.split(x_train,train_data['target']):
        x_train_sub, x_val = x_train[train_idx], x_train[val_idx]
        y_train_sub, y_val = train_data['target'][train_idx], train_data['target'][val_idx]
        model.fit(x_train_sub, y_train_sub)
        y_pred=model.predict(x_val)
        print("Log_loss: ",log_loss(y_val,y_pred),"Accuracy: ",accuracy_score(y_val, y_pred),"\n")
        
        



Log_loss:  1.5104714991323127 Accuracy:  0.9562675854148722 





Log_loss:  1.5098104984851628 Accuracy:  0.9562867260024883 





Log_loss:  1.5050563146075275 Accuracy:  0.9564243714206964 





Log_loss:  1.4992386846723709 Accuracy:  0.9565928092365173 



