In [None]:
 import pandas as pd

# transformer
from sklearn.base import BaseEstimator, TransformerMixin

# pipeline
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
# Load the training features and discard the two 'Unnamed' columns
# (presumably row indices written out by earlier to_csv calls — confirm).
# NOTE(review): unlike the test file, this path has no '.csv' extension;
# verify that this is really the file name on disk.
toefl = (
    pd.read_csv('toefl11_trainingdata_features')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

In [None]:
# Test-set features; this frame is scored by both fitted pipelines further
# down and receives their predictions as new columns.
test = pd.read_csv('Featured_Data/test_features.csv')

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that looks up ``key`` in whatever it is given.

    Used as the first step of each sub-pipeline so that every branch of the
    FeatureUnion works on its own column(s) of the same input frame.

    Parameters
    ----------
    key : hashable or list
        Passed unchanged to ``data_dict[key]`` in :meth:`transform`. In this
        notebook it is either a single column name or a list of column names.
    """

    def __init__(self, key):
        # Stored verbatim under the constructor argument's name, as
        # scikit-learn estimators are expected to do.
        self.key = key

    def fit(self, x, y=None):
        """Learn nothing; present only to satisfy the estimator interface.

        Returns
        -------
        self
        """
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

## Native Language Identification (NLI)

In [None]:
# Native-language task: build the label frame and the feature frame, then
# hold out 20% of the essays as a development set.

# `y` is toefl minus the text/feature columns listed here; only its
# 'Language' column is used as the target in this cell.
y = toefl.drop(columns=['Filename', 'text', 'POS', 'DEP', 'NER', 'prop_punct',
                        'prop_capwords', 'prop_capI', 'avg_sentlength'])

# `x` is every remaining column once identifiers, both labels and the
# unused features have been removed.
x = toefl.drop(columns=['Filename', 'Language', 'Proficiency', 'NER', 'prop_capwords'])

# A fixed seed makes the split (and therefore the reported scores)
# reproducible on Restart & Run All; stratifying keeps the per-language
# proportions the same in the train and dev parts.
X_train, X_dev, y_train, y_dev = train_test_split(
    x,
    y['Language'],
    test_size=0.20,
    random_state=42,
    stratify=y['Language'],
)


In [None]:
# Numeric columns handed to the classifier alongside the tf-idf blocks.
# NOTE(review): these columns are not rescaled; avg_sentlength is on a much
# larger scale than tf-idf weights, so consider adding a scaler — confirm.
numeric_features = ['prop_punct', 'avg_sentlength', 'prop_capI']

# NOTE(review): tfidfvect is not referenced by any pipeline below; it is kept
# only so the notebook namespace stays unchanged.
tfidfvect = TfidfVectorizer()

# Seeded so that refitting on the same data reproduces the same model
# (LinearSVC uses the seed when shuffling data in its dual solver).
linsvm = LinearSVC(random_state=42)

# The n-gram ranges below are the ones chosen for the refit.

# pipe numeric features: the selected columns are passed through unchanged
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams (unigrams + bigrams over NLTK word tokens)
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        # token_pattern is ignored once a tokenizer is supplied; passing None
        # states that explicitly and avoids scikit-learn's
        # "token_pattern will not be used" warning.
        token_pattern=None,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams (2- to 4-character sequences)
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,4)))
])

# pipe POS n-grams (1- to 3-tag sequences)
# NOTE(review): the default token_pattern only keeps tokens of two or more
# characters, so a one-letter tag such as "X" would be dropped if the POS
# column contains it — verify against the tagger's tag set.
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,3)))
])

# pipe complete: concatenate all feature blocks, then classify
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [None]:
# Fit the whole native-language pipeline (feature union followed by the
# linear SVM) on the training split.
pipe.fit(X_train, y_train)



Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'avg_sentlength',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                        

In [None]:
# Predict the native language of every essay in the development split.
y_dev_pred = pipe.predict(X_dev)

# Per-class precision / recall / F1 plus overall accuracy on the dev split.
print(classification_report(y_dev, y_dev_pred))

              precision    recall  f1-score   support

         ARA       0.83      0.77      0.80       211
         DEU       0.82      0.95      0.88       192
         FRA       0.81      0.86      0.84       191
         HIN       0.71      0.70      0.70       208
         ITA       0.83      0.86      0.84       195
         JPN       0.82      0.77      0.79       208
         KOR       0.78      0.80      0.79       211
         SPA       0.79      0.68      0.73       200
         TEL       0.77      0.80      0.78       200
         TUR       0.85      0.84      0.84       186
         ZHO       0.85      0.85      0.85       198

    accuracy                           0.81      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.80      0.81      0.80      2200



In [None]:
# Score the test frame with the native-language pipeline. The frame is
# passed whole; each sub-pipeline's ItemSelector picks the columns it needs.
test_predict_Language = pipe.predict(test)

In [None]:
# Quick look at the predicted language labels.
test_predict_Language

array(['JPN', 'ARA', 'DEU', ..., 'TUR', 'TUR', 'DEU'], dtype=object)

In [None]:
# Store the predictions on the test frame. This mutates `test` in place, so
# re-running earlier cells that display `test` will now show this column too.
# The extra column does not disturb later predict() calls, because the
# pipelines select their inputs by column name.
test['Predicted_Language'] = test_predict_Language
test

Unnamed: 0.1,Unnamed: 0,Filename,text,POS,DEP,NER,prop_punct,prop_capwords,avg_sentlength,prop_capwords_beginningsent,prop_capI,Predicted_Language
0,0,10226.txt,I agree. Because if I had many academic subjec...,PRON VERB PUNCT SCONJ SCONJ PRON VERB ADJ ADJ ...,nsubj ROOT punct mark mark nsubj advcl amod am...,"[one, only one]",0.137255,0.252874,8.700000,1.000000,1.000000,JPN
1,1,10229.txt,"I DO NOT AGREE WITH THIS STATEMENT BECOUSE , N...",PRON VERB PART VERB ADP DET NOUN PROPN PUNCT D...,nsubj aux neg ROOT prep det compound pobj punc...,[],0.106667,0.904110,23.666667,1.000000,1.000000,ARA
2,2,10392.txt,I am not quite sure about my oppinion on that ...,PRON VERB PART ADV ADJ ADP PRON NOUN ADP DET N...,nsubj ROOT neg advmod acomp prep poss pobj pre...,"[First, one, one, first, one]",0.091743,0.064433,16.166667,1.000000,1.000000,DEU
3,3,10445.txt,It is often said that young people's interest ...,PRON VERB ADV VERB SCONJ ADJ NOUN PART NOUN AD...,nsubjpass auxpass advmod ROOT mark amod poss c...,"[the Ministry of Education, Korea, 67%, 10 yea...",0.076923,0.079038,16.166667,1.000000,1.000000,JPN
4,4,10535.txt,"In mordern society, students always face the d...",ADP ADJ NOUN PUNCT NOUN ADV VERB DET NOUN SCON...,prep amod pobj punct nsubj advmod ROOT det dob...,"[one, daily, first, first]",0.084444,0.064516,20.200000,1.000000,0.857143,ZHO
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1095,1175383.txt,There are heated disscussion about that youn...,SPACE PRON VERB ADJ NOUN ADP DET ADJ NOUN VERB...,dep expl ROOT amod attr prep mark amod nsubj p...,"[MSN, ICQ]",0.128079,0.016667,11.888889,0.884615,1.000000,ZHO
1096,1096,1175412.txt,In my personal opinion young people do not spe...,ADP PRON ADJ NOUN ADJ NOUN VERB PART VERB ADJ ...,prep poss amod pobj amod nsubj aux neg ROOT am...,[First],0.091667,0.068323,14.681818,0.954545,1.000000,HIN
1097,1097,1175488.txt,I believe that people who take risks are succe...,PRON VERB SCONJ NOUN PRON VERB NOUN VERB ADJ C...,nsubj ROOT mark nsubj nsubj relcl dobj ccomp a...,[one],0.034921,0.039604,25.250000,1.000000,1.000000,TUR
1098,1098,1175980.txt,With the rapid progress of time everything aro...,ADP DET ADJ NOUN ADP NOUN PRON ADP PRON VERB V...,prep det amod pobj prep pobj nsubj prep pobj a...,[recent years],0.103306,0.015957,15.230769,1.000000,-1.000000,TUR


## Proficiency

In [None]:
# Proficiency task: same label/feature construction as for the language
# task, except that avg_sentlength is dropped from the features as well.

# `y` is toefl minus the text/feature columns listed here; only its
# 'Proficiency' column is used as the target in this cell.
y = toefl.drop(columns=['Filename', 'text', 'POS', 'DEP', 'NER', 'prop_punct',
                        'prop_capwords', 'prop_capI', 'avg_sentlength'])

# `x` is every remaining column once identifiers, both labels and the
# unused features (now including avg_sentlength) have been removed.
x = toefl.drop(columns=['Filename', 'Language', 'Proficiency', 'NER',
                        'prop_capwords', 'avg_sentlength'])

# A fixed seed makes the split reproducible on Restart & Run All.
# Stratifying matters here because the proficiency classes are imbalanced:
# it keeps their proportions the same in the train and dev parts.
X_train, X_dev, y_train, y_dev = train_test_split(
    x,
    y['Proficiency'],
    test_size=0.20,
    random_state=42,
    stratify=y['Proficiency'],
)

In [None]:
# Inspect the feature frame that feeds the proficiency pipeline.
x

Unnamed: 0,text,POS,DEP,prop_punct,prop_capI
0,Some people might think that traveling in a gr...,DET NOUN VERB VERB SCONJ VERB ADP DET NOUN VER...,det nsubj aux ROOT mark csubj prep det pobj ac...,0.073634,1.000000
1,IThe importance and popularity of travelling i...,ADP NOUN CCONJ NOUN ADP VERB AUX ADV VERB PUNC...,det nsubj cc conj prep pcomp aux advmod ROOT p...,0.052023,1.000000
2,"It is an important decision, how to plan your ...",PRON AUX DET ADJ NOUN PUNCT ADV PART VERB DET ...,nsubj ROOT det amod attr punct advmod aux advc...,0.089330,0.800000
3,Some people believe that young people can enjo...,DET NOUN VERB SCONJ ADJ NOUN VERB VERB NOUN AD...,det nsubj ROOT mark amod nsubj aux ccomp dobj ...,0.096059,1.000000
4,Travelling is usually considered as good recr...,PROPN AUX SPACE ADV VERB SCONJ ADJ NOUN SPACE ...,nsubjpass auxpass advmod ROOT prep amod pobj ...,0.048387,0.000000
...,...,...,...,...,...
10995,"Nowadays, more and more people go abroad,no ma...",ADV PUNCT ADJ CCONJ ADJ NOUN VERB ADV PUNCT AD...,advmod punct amod cc conj nsubj ROOT advmod pu...,0.121294,0.000000
10996,\tIn accomplishing something that is risky com...,SPACE ADP VERB PRON DET AUX ADJ VERB DET NOUN ...,prep pcomp dobj nsubj relcl acomp ROOT det ns...,0.094148,1.000000
10997,"At the beginning of the 21st century, the incr...",ADP DET NOUN ADP DET ADJ NOUN PUNCT DET VERB N...,prep det pobj prep det amod pobj punct det amo...,0.070028,1.000000
10998,The number of cars in use across the world has...,DET NOUN ADP NOUN ADP NOUN ADP DET NOUN AUX AU...,det nsubj prep pobj prep pobj prep det pobj au...,0.090253,1.000000


In [None]:
# Numeric columns handed to the proficiency classifier alongside the tf-idf
# blocks (avg_sentlength is deliberately left out for this task).
numeric_features = ['prop_punct', 'prop_capI']

# NOTE(review): tfidfvect is not referenced by any pipeline below; it is kept
# only so the notebook namespace stays unchanged.
tfidfvect = TfidfVectorizer()

# Seeded so that refitting on the same data reproduces the same model
# (LinearSVC uses the seed when shuffling data in its dual solver).
linsvm = LinearSVC(random_state=42)

# The n-gram ranges below are the ones chosen for the refit.

# pipe numeric features: the selected columns are passed through unchanged
numfeat_pipe = Pipeline([
    ('selector', ItemSelector(key=numeric_features))
])

# pipe word n-grams (unigrams + bigrams over NLTK word tokens)
text_pipe = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('vect', TfidfVectorizer(
        tokenizer=word_tokenize,
        # token_pattern is ignored once a tokenizer is supplied; passing None
        # states that explicitly and avoids scikit-learn's
        # "token_pattern will not be used" warning.
        token_pattern=None,
        analyzer='word',
        lowercase=True,
        ngram_range=(1,2)))
])

# pipe char n-grams (2- to 3-character sequences)
char_pipe = Pipeline([
    ('selector', ItemSelector(key="text")),
    ('vect', TfidfVectorizer(
        analyzer='char',
        lowercase=True,
        ngram_range=(2,3)))
])

# pipe POS n-grams (1- to 4-tag sequences)
# NOTE(review): the default token_pattern only keeps tokens of two or more
# characters, so a one-letter tag such as "X" would be dropped if the POS
# column contains it — verify against the tagger's tag set.
pos_pipe = Pipeline([
    ('selector', ItemSelector(key="POS")),
    ('vect', TfidfVectorizer(
        analyzer='word',
        lowercase=True,
        ngram_range=(1,4)))
])

# pipe complete: concatenate all feature blocks, then classify.
# This rebinds `pipe`, replacing the native-language pipeline built earlier.
pipe = Pipeline([
    ('feats', FeatureUnion([
        ('numfeat_pipe', numfeat_pipe),
        ('text_pipe', text_pipe),
        ('char_pipe', char_pipe),
        ('pos_pipe', pos_pipe)
    ])),
    ('cls', linsvm)
])

In [None]:
# Fit the proficiency pipeline (feature union followed by the linear SVM)
# on the training split created for the 'Proficiency' target.
pipe.fit(X_train, y_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('numfeat_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key=['prop_punct',
                                                                                    'prop_capI']))])),
                                                ('text_pipe',
                                                 Pipeline(steps=[('selector',
                                                                  ItemSelector(key='text')),
                                                                 ('vect',
                                                                  TfidfVectorizer(ngram_range=(1,
                                                                                               2),
                                                                                  tokenizer=<function word_tokenize at 0x00000

In [None]:
# classification_report is already imported in the notebook's import cell,
# so it is not re-imported here.

# Predict the proficiency level of every essay in the development split.
y_dev_pred = pipe.predict(X_dev)

# Per-class precision / recall / F1 plus overall accuracy on the dev split.
print(classification_report(y_dev, y_dev_pred))

              precision    recall  f1-score   support

        high       0.67      0.78      0.72       802
         low       0.81      0.36      0.50       221
      medium       0.73      0.73      0.73      1177

    accuracy                           0.71      2200
   macro avg       0.74      0.62      0.65      2200
weighted avg       0.72      0.71      0.70      2200



In [None]:
# `pipe` now refers to the proficiency pipeline (it was rebound and refitted
# above), so this call predicts proficiency levels for the test frame.
test_predict_proficiency = pipe.predict(test)
test_predict_proficiency

array(['low', 'low', 'high', ..., 'high', 'medium', 'high'], dtype=object)

In [None]:
# Store the proficiency predictions on the test frame (in-place mutation).
test['Predicted_Proficiency'] = test_predict_proficiency

In [None]:
# Inspect the test frame with both prediction columns attached.
test

Unnamed: 0.1,Unnamed: 0,Filename,text,POS,DEP,NER,prop_punct,prop_capwords,avg_sentlength,prop_capwords_beginningsent,prop_capI,Predicted_Language,Predicted_Proficiency
0,0,10226.txt,I agree. Because if I had many academic subjec...,PRON VERB PUNCT SCONJ SCONJ PRON VERB ADJ ADJ ...,nsubj ROOT punct mark mark nsubj advcl amod am...,"[one, only one]",0.137255,0.252874,8.700000,1.000000,1.000000,JPN,low
1,1,10229.txt,"I DO NOT AGREE WITH THIS STATEMENT BECOUSE , N...",PRON VERB PART VERB ADP DET NOUN PROPN PUNCT D...,nsubj aux neg ROOT prep det compound pobj punc...,[],0.106667,0.904110,23.666667,1.000000,1.000000,ARA,low
2,2,10392.txt,I am not quite sure about my oppinion on that ...,PRON VERB PART ADV ADJ ADP PRON NOUN ADP DET N...,nsubj ROOT neg advmod acomp prep poss pobj pre...,"[First, one, one, first, one]",0.091743,0.064433,16.166667,1.000000,1.000000,DEU,high
3,3,10445.txt,It is often said that young people's interest ...,PRON VERB ADV VERB SCONJ ADJ NOUN PART NOUN AD...,nsubjpass auxpass advmod ROOT mark amod poss c...,"[the Ministry of Education, Korea, 67%, 10 yea...",0.076923,0.079038,16.166667,1.000000,1.000000,JPN,high
4,4,10535.txt,"In mordern society, students always face the d...",ADP ADJ NOUN PUNCT NOUN ADV VERB DET NOUN SCON...,prep amod pobj punct nsubj advmod ROOT det dob...,"[one, daily, first, first]",0.084444,0.064516,20.200000,1.000000,0.857143,ZHO,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1095,1175383.txt,There are heated disscussion about that youn...,SPACE PRON VERB ADJ NOUN ADP DET ADJ NOUN VERB...,dep expl ROOT amod attr prep mark amod nsubj p...,"[MSN, ICQ]",0.128079,0.016667,11.888889,0.884615,1.000000,ZHO,medium
1096,1096,1175412.txt,In my personal opinion young people do not spe...,ADP PRON ADJ NOUN ADJ NOUN VERB PART VERB ADJ ...,prep poss amod pobj amod nsubj aux neg ROOT am...,[First],0.091667,0.068323,14.681818,0.954545,1.000000,HIN,medium
1097,1097,1175488.txt,I believe that people who take risks are succe...,PRON VERB SCONJ NOUN PRON VERB NOUN VERB ADJ C...,nsubj ROOT mark nsubj nsubj relcl dobj ccomp a...,[one],0.034921,0.039604,25.250000,1.000000,1.000000,TUR,high
1098,1098,1175980.txt,With the rapid progress of time everything aro...,ADP DET ADJ NOUN ADP NOUN PRON ADP PRON VERB V...,prep det amod pobj prep pobj nsubj prep pobj a...,[recent years],0.103306,0.015957,15.230769,1.000000,-1.000000,TUR,medium


In [None]:
# Keep only the identifying columns and the two predictions for export.
output = test[['Filename','text', 'Predicted_Language','Predicted_Proficiency']]
output

Unnamed: 0,Filename,text,Predicted_Language,Predicted_Proficiency
0,10226.txt,I agree. Because if I had many academic subjec...,JPN,low
1,10229.txt,"I DO NOT AGREE WITH THIS STATEMENT BECOUSE , N...",ARA,low
2,10392.txt,I am not quite sure about my oppinion on that ...,DEU,high
3,10445.txt,It is often said that young people's interest ...,JPN,high
4,10535.txt,"In mordern society, students always face the d...",ZHO,high
...,...,...,...,...
1095,1175383.txt,There are heated disscussion about that youn...,ZHO,medium
1096,1175412.txt,In my personal opinion young people do not spe...,HIN,medium
1097,1175488.txt,I believe that people who take risks are succe...,TUR,high
1098,1175980.txt,With the rapid progress of time everything aro...,TUR,medium


In [None]:
# Write the predictions without the DataFrame index. Exporting the index
# adds an unnamed first column that comes back as 'Unnamed: 0' when the file
# is read again — the same artefact that has to be dropped from the training
# data at the top of this notebook.
output.to_csv('predictions_output.csv', index=False)