In [1]:
#imports
import pandas as pd
import nltk
nltk.download('punkt')

# transformer
from sklearn.base import BaseEstimator, TransformerMixin

# pipeline
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
#load the data
#toefl = pd.read_csv('Google Drive/NLP Shared task/toefl11_trainingdata_features') --> colab
toefl = pd.read_csv('toefl11_trainingdata_features')
toefl = toefl.drop(['Unnamed: 0','Unnamed: 0.1'], axis= 1)

In [12]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)


In [13]:
#define custom tranformer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, data_dict):
        return data_dict[self.key]

In [14]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capwords', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#make the individual pipelines

# pipe numeric features
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe dependencies
dep_pipe = Pipeline([
    ('selector', ItemSelector(key="DEP")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

In [15]:
# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.10)

In [18]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [9]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.88      0.75      0.81       103
         DEU       0.86      0.95      0.91       101
         FRA       0.86      0.77      0.81       115
         HIN       0.77      0.79      0.78       111
         ITA       0.74      0.93      0.83        92
         JPN       0.82      0.72      0.77        98
         KOR       0.82      0.80      0.81        98
         SPA       0.73      0.71      0.72        96
         TEL       0.81      0.81      0.81       100
         TUR       0.81      0.84      0.82        89
         ZHO       0.83      0.85      0.84        97

    accuracy                           0.81      1100
   macro avg       0.81      0.81      0.81      1100
weighted avg       0.81      0.81      0.81      1100



In [19]:
# cross-validation
cv_results = cross_val_predict(pipe, x, y['Language'], cv=10, verbose=True, n_jobs=-1)
print(classification_report(y['Language'], cv_results))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 32.7min finished


              precision    recall  f1-score   support

         ARA       0.85      0.74      0.79      1000
         DEU       0.85      0.93      0.89      1000
         FRA       0.83      0.84      0.83      1000
         HIN       0.70      0.76      0.73      1000
         ITA       0.87      0.86      0.86      1000
         JPN       0.82      0.79      0.81      1000
         KOR       0.78      0.78      0.78      1000
         SPA       0.76      0.78      0.77      1000
         TEL       0.79      0.76      0.77      1000
         TUR       0.86      0.86      0.86      1000
         ZHO       0.83      0.87      0.85      1000

    accuracy                           0.81     11000
   macro avg       0.81      0.81      0.81     11000
weighted avg       0.81      0.81      0.81     11000



In [None]:
params = {"feats__text_pipe__vect__ngram_range": [(1,2), (1,3)], 
          "feats__char_pipe__vect__ngram_range": [(2,2), (2,3), (2,4)],
          "feats__pos_pipe__vect__ngram_range": [(1,2),(1,3),(1,4)]
         }

ngram_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1, verbose=10)
ngram_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 68.1min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 93.5min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 122.3min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 161.8min
[Parallel(n_jobs=-1)]: Done  77 out of  90 | elapsed: 214.6min remaining: 36.2min
[Parallel(n_jobs=-1)]: Done  87 out of  90 | elapsed: 252.4min remaining:  8.7min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 256.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('feats',
                                        FeatureUnion(transformer_list=[('numfeat_pipe',
                                                                        Pipeline(steps=[('selector',
                                                                                         ItemSelector(key=['prop_punct',
                                                                                                           'avg_sentlength',
                                                                                                           'prop_capwords',
                                                                                                           'prop_capI']))])),
                                                                       ('text_pipe',
                                                                        Pipeline(steps=[('selector',
                                                             

In [None]:
print(ngram_search.best_score_)
print(ngram_search.best_params_)

0.7997727272727273
{'feats__char_pipe__vect__ngram_range': (2, 4), 'feats__pos_pipe__vect__ngram_range': (1, 3), 'feats__text_pipe__vect__ngram_range': (1, 2)}


In [20]:
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [21]:
# fit model
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [22]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.87      0.82      0.85        97
         DEU       0.85      0.97      0.91       115
         FRA       0.90      0.85      0.87        98
         HIN       0.79      0.71      0.75       120
         ITA       0.86      0.92      0.89       101
         JPN       0.87      0.88      0.88       100
         KOR       0.89      0.80      0.84        95
         SPA       0.78      0.77      0.77        90
         TEL       0.74      0.78      0.76        91
         TUR       0.86      0.91      0.89        98
         ZHO       0.88      0.89      0.89        95

    accuracy                           0.85      1100
   macro avg       0.85      0.85      0.84      1100
weighted avg       0.85      0.85      0.84      1100



In [23]:
# cross-validation
cv_results = cross_val_predict(pipe, x, y['Language'], cv=10, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 41.8min finished


ValueError: Found input variables with inconsistent numbers of samples: [1100, 11000]

In [24]:
print(classification_report(y['Language'], cv_results))

              precision    recall  f1-score   support

         ARA       0.83      0.77      0.80      1000
         DEU       0.87      0.93      0.90      1000
         FRA       0.85      0.83      0.84      1000
         HIN       0.71      0.74      0.73      1000
         ITA       0.87      0.87      0.87      1000
         JPN       0.82      0.80      0.81      1000
         KOR       0.79      0.79      0.79      1000
         SPA       0.77      0.79      0.78      1000
         TEL       0.78      0.77      0.78      1000
         TUR       0.88      0.85      0.87      1000
         ZHO       0.84      0.88      0.86      1000

    accuracy                           0.82     11000
   macro avg       0.82      0.82      0.82     11000
weighted avg       0.82      0.82      0.82     11000



In [None]:
test = pd.read_csv('Featured_Data/test_features.csv')
test_predict = pipe.predict(test)