In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#imports
import pandas as pd

# transformer
from sklearn.base import BaseEstimator, TransformerMixin

# pipeline
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report

In [3]:
# Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [9]:
toefl = pd.read_csv('toefl11_trainingdata_features')
toefl = toefl.drop(['Unnamed: 0','Unnamed: 0.1'], axis= 1)

In [None]:
#load the data
toefl = pd.read_csv('Google Drive/NLP Shared task/toefl11_trainingdata_features')

In [None]:
toefl = pd.read_csv('toefl11_trainingdata_features')
toefl = toefl.drop(['Unnamed: 0','Unnamed: 0.1'], axis= 1)

In [10]:
#Pipeline
#define features and y 
x = toefl.drop(['Filename', 'Language', 'Proficiency'], axis= 1)
y = toefl.drop(['Filename','text','POS','DEP', 'NER', 'prop_punct', 'prop_capwords', 'prop_capI','avg_sentlength'], axis = 1)


In [11]:
#define custom tranformer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, data_dict):
        return data_dict[self.key]

In [12]:
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capwords', 'prop_capI']
tfvect = CountVectorizer()
tfidfvect = TfidfVectorizer()
svm = SVC()
linsvm = LinearSVC()
#make the individual pipelines

# pipe numeric features
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe dependencies
dep_pipe = Pipeline([
    ('selector', ItemSelector(key="DEP")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

In [13]:
# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x, y['Proficiency'], test_size= 0.10)

In [17]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [None]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        high       0.63      0.85      0.72       747
         low       0.81      0.25      0.38       270
      medium       0.72      0.68      0.70      1183

    accuracy                           0.68      2200
   macro avg       0.72      0.59      0.60      2200
weighted avg       0.70      0.68      0.67      2200



In [18]:
# cross-validation
cv_results = cross_val_predict(pipe, x, y['Proficiency'], cv=10, verbose=True, n_jobs=-1)
print(classification_report(y['Proficiency'], cv_results))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.6min finished


              precision    recall  f1-score   support

        high       0.65      0.80      0.72      3835
         low       0.78      0.30      0.43      1201
      medium       0.73      0.71      0.72      5964

    accuracy                           0.70     11000
   macro avg       0.72      0.61      0.62     11000
weighted avg       0.71      0.70      0.69     11000



In [None]:
params = {"feats__text_pipe__vect__ngram_range": [(1,2), (1,3)], 
          "feats__char_pipe__vect__ngram_range": [(2,2), (2,3), (2,4)],
          "feats__pos_pipe__vect__ngram_range": [(1,2),(1,3),(1,4)]
         }

ngram_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1, verbose=10)
ngram_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 22.7min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 55.3min
[Parallel(n_jobs=-1)]: Done  77 out of  90 | elapsed: 73.0min remaining: 12.3min
[Parallel(n_jobs=-1)]: Done  87 out of  90 | elapsed: 85.9min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 88.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('feats',
                                        FeatureUnion(transformer_list=[('numfeat_pipe',
                                                                        Pipeline(steps=[('selector',
                                                                                         ItemSelector(key=['prop_punct',
                                                                                                           'avg_sentlength',
                                                                                                           'prop_capwords',
                                                                                                           'prop_capI']))])),
                                                                       ('text_pipe',
                                                                        Pipeline(steps=[('selector',
                                                             

In [None]:
print(ngram_search.best_score_)
print(ngram_search.best_params_)

0.7055681818181819
{'feats__char_pipe__vect__ngram_range': (2, 3), 'feats__pos_pipe__vect__ngram_range': (1, 4), 'feats__text_pipe__vect__ngram_range': (1, 2)}


In [19]:
#refit with best ranges
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [20]:
# fit model
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capwords',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                      

In [None]:
# prediction
y_test_pred = pipe.predict(X_test)

# accuracy
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        high       0.65      0.83      0.73       747
         low       0.83      0.27      0.41       270
      medium       0.73      0.71      0.72      1183

    accuracy                           0.70      2200
   macro avg       0.74      0.60      0.62      2200
weighted avg       0.71      0.70      0.68      2200



In [21]:
# cross-validation
# cross-validation
cv_results = cross_val_predict(pipe, x, y['Proficiency'], cv=10, verbose=True, n_jobs=-1)
print(classification_report(y['Proficiency'], cv_results))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.3min finished


              precision    recall  f1-score   support

        high       0.66      0.79      0.72      3835
         low       0.82      0.34      0.48      1201
      medium       0.73      0.72      0.73      5964

    accuracy                           0.71     11000
   macro avg       0.74      0.62      0.64     11000
weighted avg       0.72      0.71      0.70     11000



In [None]:
test = pd.read_csv('test_features.csv')
test_predict = pipe.predict(test)

In [None]:
test_predict

(1100,)

In [None]:
test_predict = pd.Series(test_predict)
test_predict.to_csv('test_proficiency_predictions.csv')