In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
#imports
import pandas as pd

# transformer
from sklearn.base import BaseEstimator, TransformerMixin

# pipeline
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report

In [15]:
#load the data
#toefl = pd.read_csv('Google Drive/NLP Shared task/toefl11_trainingdata_features') --> colab
toefl = pd.read_csv('toefl11_trainingdata_features')
toefl = toefl.drop(['Unnamed: 0','Unnamed: 0.1'], axis= 1)

In [16]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)


In [17]:
#define custom tranformer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, data_dict):
        return data_dict[self.key]

In [18]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capwords', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#make the individual pipelines

# pipe numeric features
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe dependencies
dep_pipe = Pipeline([
    ('selector', ItemSelector(key="DEP")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

In [19]:
# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Language'], test_size= 0.20)

In [21]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [22]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.84      0.75      0.79       190
         DEU       0.84      0.93      0.88       207
         FRA       0.84      0.79      0.82       229
         HIN       0.70      0.79      0.74       184
         ITA       0.84      0.87      0.86       198
         JPN       0.86      0.76      0.80       197
         KOR       0.75      0.83      0.79       189
         SPA       0.73      0.77      0.75       216
         TEL       0.81      0.76      0.78       180
         TUR       0.88      0.82      0.85       211
         ZHO       0.82      0.82      0.82       199

    accuracy                           0.81      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.81      0.81      0.81      2200



In [23]:
# cross-validation
cv_results = cross_val_predict(pipe, X_test, y_test, cv=10, verbose=True, n_jobs=-1)
print(classification_report(y_test, cv_results))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

         ARA       0.78      0.55      0.64       190
         DEU       0.70      0.91      0.79       207
         FRA       0.67      0.75      0.71       229
         HIN       0.64      0.65      0.64       184
         ITA       0.74      0.76      0.75       198
         JPN       0.65      0.61      0.63       197
         KOR       0.59      0.62      0.61       189
         SPA       0.63      0.60      0.61       216
         TEL       0.70      0.69      0.70       180
         TUR       0.77      0.64      0.70       211
         ZHO       0.67      0.69      0.68       199

    accuracy                           0.68      2200
   macro avg       0.68      0.68      0.68      2200
weighted avg       0.68      0.68      0.68      2200



[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.5min finished


In [24]:
params = {"feats__text_pipe__vect__ngram_range": [(1,2), (1,3)], 
          "feats__char_pipe__vect__ngram_range": [(2,2), (2,3), (2,4)],
          "feats__pos_pipe__vect__ngram_range": [(1,2),(1,3),(1,4)]
         }

ngram_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1, verbose=10)
ngram_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 50.1min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 68.1min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 93.5min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 122.3min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 161.8min
[Parallel(n_jobs=-1)]: Done  77 out of  90 | elapsed: 214.6min remaining: 36.2min
[Parallel(n_jobs=-1)]: Done  87 out of  90 | elapsed: 252.4min remaining:  8.7min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 256.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('feats',
                                        FeatureUnion(transformer_list=[('numfeat_pipe',
                                                                        Pipeline(steps=[('selector',
                                                                                         ItemSelector(key=['prop_punct',
                                                                                                           'avg_sentlength',
                                                                                                           'prop_capwords',
                                                                                                           'prop_capI']))])),
                                                                       ('text_pipe',
                                                                        Pipeline(steps=[('selector',
                                                             

In [25]:
print(ngram_search.best_score_)
print(ngram_search.best_params_)

0.7997727272727273
{'feats__char_pipe__vect__ngram_range': (2, 4), 'feats__pos_pipe__vect__ngram_range': (1, 3), 'feats__text_pipe__vect__ngram_range': (1, 2)}


In [26]:
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [27]:
# fit model
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [28]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         ARA       0.82      0.77      0.80       190
         DEU       0.84      0.93      0.88       207
         FRA       0.84      0.79      0.81       229
         HIN       0.70      0.82      0.75       184
         ITA       0.82      0.90      0.86       198
         JPN       0.86      0.76      0.81       197
         KOR       0.75      0.83      0.79       189
         SPA       0.78      0.75      0.76       216
         TEL       0.83      0.74      0.78       180
         TUR       0.89      0.84      0.86       211
         ZHO       0.83      0.81      0.82       199

    accuracy                           0.81      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.82      0.81      0.81      2200



In [34]:
# cross-validation
cv_results = cross_val_predict(pipe, X_test, y_test, cv=10, verbose=True, n_jobs=-1)
print(classification_report(y_test, cv_results))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

         ARA       0.76      0.55      0.64       190
         DEU       0.72      0.90      0.80       207
         FRA       0.71      0.76      0.73       229
         HIN       0.65      0.64      0.64       184
         ITA       0.76      0.76      0.76       198
         JPN       0.66      0.64      0.65       197
         KOR       0.61      0.65      0.63       189
         SPA       0.61      0.62      0.61       216
         TEL       0.70      0.71      0.70       180
         TUR       0.75      0.65      0.70       211
         ZHO       0.69      0.69      0.69       199

    accuracy                           0.69      2200
   macro avg       0.69      0.69      0.69      2200
weighted avg       0.69      0.69      0.69      2200



[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.4min finished


In [31]:
test = pd.read_csv('Featured_Data/test_features.csv')
test_predict = pipe.predict(test)

In [32]:
test_predict

array(['JPN', 'ARA', 'DEU', ..., 'SPA', 'TUR', 'DEU'], dtype=object)

In [33]:
test_predict = pd.Series(test_predict)
test_predict.to_csv('test_language_predictions.csv')