In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss


In [10]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv('test.csv')

In [11]:
train_df.describe().T

Unnamed: 0,count,unique,top,freq
discourse_id,36765,36765,0013cc385424,1
essay_id,36765,4191,91B1F82B2CF1,23
discourse_text,36765,36691,Summer projects should be student-designed,14
discourse_type,36765,7,Evidence,12105
discourse_effectiveness,36765,3,Adequate,20977


In [4]:
dem = {'Adequate':0, 'Ineffective':1, 'Effective':2}
train_df['target'] = train_df['discourse_effectiveness'].map(dem)

In [5]:
train_df.head(5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,0
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,0
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,0


In [6]:
train_df.drop(labels = ('discourse_effectiveness'), 
              axis=1)
train_df.reset_index(drop = True)
train_df.head(2)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0


In [7]:
n_folds = 5
skf = StratifiedKFold(n_splits= n_folds, shuffle= True, random_state= 42)
for i, (train_index, test_index) in enumerate(skf.split(train_df, train_df["target"])):
    train_df.loc[test_index,"fold"] = i
print(train_df.fold.value_counts())   

0.0    7353
1.0    7353
2.0    7353
4.0    7353
3.0    7353
Name: fold, dtype: int64


In [8]:
preds = []

In [9]:
for n_fold in range(n_folds):
    train = train_df[train_df['fold']!=n_fold]
    val = train_df[train_df['fold']==n_fold] 
    
    tf = TfidfVectorizer(ngram_range=(1,2),norm='l2', smooth_idf=True)
    tr_discourse_tfidf = tf.fit_transform(train["discourse_text"])
    eval_discourse_tfidf = tf.transform(val["discourse_text"])
    te_discourse_tfidf = tf.transform(test_df["discourse_text"])
    
    #text
    #tf = TfidfVectorizer(ngram_range=(1,2),norm='l2', smooth_idf=True) # Load tf another time because it will learn the new vocabulary for 'text'
    #tr_text_tfidf = tf.fit_transform(train["text"])
    #eval_text_tfidf = tf.transform(val["text"])
    #te_text_tfidf = tf.transform(test_df["text"])
    
    #discourse_type
    ohe = OneHotEncoder()
    tr_type_ohe =  sparse.csr_matrix(ohe.fit_transform(train["discourse_type"].values.reshape(-1,1)))
    eval_type_ohe =  sparse.csr_matrix(ohe.transform(val["discourse_type"].values.reshape(-1,1)))
    te_type_ohe =  sparse.csr_matrix(ohe.transform(test_df["discourse_type"].values.reshape(-1,1)))
        
    #Stack each vector representations 
    tr_tfidf = sparse.hstack((tr_type_ohe,tr_discourse_tfidf))
    eval_tfidf = sparse.hstack((eval_type_ohe,eval_discourse_tfidf))
    te_tfidf = sparse.hstack((te_type_ohe,te_discourse_tfidf))
    
    #Model
    #clf = LogisticRegression(max_iter=500,penalty="l2",C=1.0131816333513533)
    #clf.fit(tr_tfidf, train["target"].values)
    
    clf = LGBMClassifier()
    clf.fit(tr_tfidf, train["target"].values)
    
    #Validation 
    ev_preds = clf.predict_proba(eval_tfidf)
    ev_loss = log_loss(val["target"].values,ev_preds)
    print("Fold : {} EV score: {}".format(n_fold,ev_loss))
    
    #Test
    preds.append(clf.predict_proba(te_tfidf))



Fold : 0 EV score: 0.759145243252325




Fold : 1 EV score: 0.7501418235171594




Fold : 2 EV score: 0.7771120754809966




Fold : 3 EV score: 0.7768124629930829




Fold : 4 EV score: 0.7634138419389856




In [1]:
submission = pd.read_csv("Effective-Arguments/sample_submission.csv")
submission

NameError: name 'pd' is not defined

In [11]:
predictions = np.array(preds).mean(0)
submission.loc[:,"Ineffective"] = predictions[:,0]
submission.loc[:,"Adequate"] = predictions[:,1]
submission.loc[:,"Effective"] = predictions[:,2]
submission.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.251141,0.015476,0.733382
1,5a88900e7dc1,0.673514,0.047944,0.278542
2,9790d835736b,0.623609,0.074768,0.301623
3,75ce6d68b67b,0.495991,0.138563,0.365446
4,93578d946723,0.500776,0.095289,0.403935


In [12]:
submission.to_csv("submission.csv", index=False)