# Experimentation

In [1]:
#imports
import pandas as pd

# transformer
from sklearn.base import BaseEstimator, TransformerMixin

# pipeline
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#load the data
toefl = pd.read_csv('Google Drive/NLP Shared task/toefl11_trainingdata_features')

FileNotFoundError: [Errno 2] No such file or directory: 'Google Drive/NLP Shared task/toefl11_trainingdata_features'

In [3]:
toefl = pd.read_csv('toefl11_trainingdata_features')
toefl = toefl.drop(['Unnamed: 0','Unnamed: 0.1'], axis= 1)

In [4]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)


In [5]:
#define custom tranformer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, data_dict):
        return data_dict[self.key]

## Proficiency

In [10]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)

#feature simplification
#- NER
# - prop_capwords

In [8]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Proficiency'], test_size= 0.20,, random_state=11)

In [12]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                        

In [13]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        high       0.66      0.79      0.72       783
         low       0.73      0.33      0.46       239
      medium       0.72      0.71      0.71      1178

    accuracy                           0.70      2200
   macro avg       0.70      0.61      0.63      2200
weighted avg       0.70      0.70      0.69      2200



### -avg sent_length

In [14]:
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords','avg_sentlength'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(x, y['Proficiency'], test_size= 0.20,, random_state=11)
# dropping avg_sent lenght as well

In [15]:
numeric_features = ['prop_punct', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [16]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x00000

In [17]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        high       0.66      0.79      0.72       761
         low       0.82      0.36      0.50       221
      medium       0.75      0.73      0.74      1218

    accuracy                           0.72      2200
   macro avg       0.74      0.63      0.66      2200
weighted avg       0.73      0.72      0.71      2200



### -numeric features

In [19]:
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords','prop_capI','prop_punct','avg_sentlength'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(x, y['Proficiency'], test_size= 0.20, random_state=11)
# dropping all numeric features + NER

In [22]:
linsvm = LinearSVC()
#refit with best ranges

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x000001F51C7AFA60>))])),
                                                ('char_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                  

In [24]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        high       0.66      0.80      0.72       765
         low       0.80      0.34      0.48       238
      medium       0.74      0.72      0.73      1197

    accuracy                           0.71      2200
   macro avg       0.73      0.62      0.64      2200
weighted avg       0.72      0.71      0.70      2200



## NLI experimentation

### baseline

In [28]:
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)


In [29]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capwords', 'prop_capI']
tfidfvect = TfidfVectorizer()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20, random_state=11)
# fit model
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [31]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.83      0.76      0.79       188
         DEU       0.88      0.91      0.89       193
         FRA       0.83      0.77      0.80       198
         HIN       0.68      0.69      0.68       214
         ITA       0.89      0.83      0.86       199
         JPN       0.77      0.74      0.76       193
         KOR       0.74      0.79      0.76       203
         SPA       0.74      0.78      0.76       200
         TEL       0.71      0.74      0.72       192
         TUR       0.83      0.84      0.84       208
         ZHO       0.84      0.85      0.84       212

    accuracy                           0.79      2200
   macro avg       0.79      0.79      0.79      2200
weighted avg       0.79      0.79      0.79      2200



### Dropping NER & prop capwords

In [13]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords'], axis= 1)
#feature simplification
#- NER
# - prop_capwords

In [14]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capI']
tfidfvect = TfidfVectorizer()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20, random_state=11)
# fit model
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x0000022F4005F8B0>))])),
                                                ('char_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                  

In [16]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.85      0.76      0.80       196
         DEU       0.86      0.92      0.89       194
         FRA       0.81      0.81      0.81       206
         HIN       0.69      0.77      0.73       192
         ITA       0.84      0.85      0.85       218
         JPN       0.84      0.78      0.80       196
         KOR       0.76      0.79      0.78       188
         SPA       0.74      0.77      0.76       203
         TEL       0.79      0.74      0.76       204
         TUR       0.89      0.85      0.87       217
         ZHO       0.85      0.89      0.87       186

    accuracy                           0.81      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.81      0.81      0.81      2200



### -avg sent_length

In [17]:
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords','avg_sentlength'], axis = 1, random_state=11)
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20)
# dropping avg_sent lenght as well

In [18]:
numeric_features = ['prop_punct', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [19]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x00000

In [20]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.88      0.74      0.81       213
         DEU       0.86      0.92      0.89       200
         FRA       0.81      0.78      0.79       192
         HIN       0.70      0.74      0.72       200
         ITA       0.86      0.85      0.86       200
         JPN       0.86      0.73      0.79       188
         KOR       0.69      0.80      0.74       178
         SPA       0.73      0.75      0.74       204
         TEL       0.77      0.75      0.76       203
         TUR       0.84      0.88      0.86       205
         ZHO       0.84      0.85      0.85       217

    accuracy                           0.80      2200
   macro avg       0.80      0.80      0.80      2200
weighted avg       0.80      0.80      0.80      2200



### -numeric features

In [21]:
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords','prop_capI','prop_punct','avg_sentlength'], axis = 1, random_state=11)
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20)
# dropping all numeric features + NER

In [22]:
linsvm = LinearSVC()
#refit with best ranges

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x0000022F4005F8B0>))])),
                                                ('char_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                  

In [24]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.86      0.78      0.82       198
         DEU       0.87      0.94      0.90       191
         FRA       0.86      0.84      0.85       206
         HIN       0.69      0.72      0.71       199
         ITA       0.85      0.84      0.85       212
         JPN       0.79      0.74      0.76       205
         KOR       0.73      0.76      0.74       182
         SPA       0.76      0.77      0.77       217
         TEL       0.78      0.75      0.76       216
         TUR       0.87      0.87      0.87       183
         ZHO       0.80      0.86      0.83       191

    accuracy                           0.80      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.81      0.80      0.80      2200



### only dropping NER

In [35]:
x = toefl.drop(['Filename', 'Language', 'Proficiency','NER'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20, random_state=11)
# only dropping NER

In [36]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capwords', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [37]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [38]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.81      0.73      0.77       188
         DEU       0.87      0.90      0.88       193
         FRA       0.86      0.79      0.82       198
         HIN       0.69      0.69      0.69       214
         ITA       0.86      0.86      0.86       199
         JPN       0.78      0.78      0.78       193
         KOR       0.78      0.79      0.78       203
         SPA       0.70      0.78      0.74       200
         TEL       0.71      0.76      0.73       192
         TUR       0.89      0.82      0.85       208
         ZHO       0.85      0.85      0.85       212

    accuracy                           0.79      2200
   macro avg       0.80      0.79      0.80      2200
weighted avg       0.80      0.79      0.80      2200



### Dropping prop capwords

In [39]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency', 'prop_capwords'], axis= 1)
#feature simplification
# - prop_capwords

In [40]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capI']
tfidfvect = TfidfVectorizer()
linsvm = LinearSVC()
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [41]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20, random_state=11)
# fit model
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                        

In [42]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.83      0.76      0.79       188
         DEU       0.88      0.90      0.89       193
         FRA       0.84      0.78      0.81       198
         HIN       0.66      0.73      0.69       214
         ITA       0.88      0.84      0.86       199
         JPN       0.77      0.76      0.76       193
         KOR       0.76      0.77      0.77       203
         SPA       0.74      0.78      0.76       200
         TEL       0.71      0.73      0.72       192
         TUR       0.88      0.82      0.85       208
         ZHO       0.83      0.86      0.85       212

    accuracy                           0.79      2200
   macro avg       0.80      0.79      0.79      2200
weighted avg       0.80      0.79      0.79      2200

