In [10]:
# Best SVM after grid search: {'svm__l1_ratio': 0.2, 'tfidf__ngram_range': (1, 5)}

import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import eli5

# pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False, analyzer='char', max_df=0.7, ngram_range=(1,5))),
    ('svm', SGDClassifier(loss='hinge', penalty='elasticnet', fit_intercept=True, class_weight='balanced', average=False, alpha=1e-5, l1_ratio=0.2))
    ])

# data
df = pd.read_csv('processed_data_nospace.csv')
X, y = df.phonetic_transcription.fillna(' '), df.language_classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

#fitting
model = pipeline.fit(X_train, y_train)

# predicting & evaluating
y_pred = model.predict(X_test)
cr = classification_report(y_test, y_pred)
print(cr)





              precision    recall  f1-score   support

          NL       0.88      0.86      0.87     10762
          VL       0.73      0.77      0.75      5240

   micro avg       0.83      0.83      0.83     16002
   macro avg       0.81      0.81      0.81     16002
weighted avg       0.83      0.83      0.83     16002



Weight?,Feature
+5.913,h@t
+5.663,G
+5.356,dA
+5.279,@n
+5.164,v@r
+4.998,gd
+4.712,pf
+4.525,tx
+4.476,@r
+4.344,tx@


In [11]:
# top features
feature_names = model.steps[0][1].get_feature_names()
eli5.explain_weights(model.steps[1][1], top=110, feature_names=feature_names, target_names=['VL', 'NL'], targets=['VL', 'NL'])

Weight?,Feature
+5.913,h@t
+5.663,G
+5.356,dA
+5.279,@n
+5.164,v@r
+4.998,gd
+4.712,pf
+4.525,tx
+4.476,@r
+4.344,tx@
