## Instalări și pregătirea mediului

In [51]:
!pip install pyphen



In [52]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict, KFold, GridSearchCV, cross_val_score, train_test_split, cross_validate
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, recall_score
import pyphen
import numpy as np
import pandas as pd
from dale_chall import DALE_CHALL
from timeit import default_timer as timer
from datetime import timedelta

## Extragerea și prelucrarea datelor

In [53]:
# Column dtypes requested from read_excel.
# NOTE(review): this map names "complexity", while the label column read later
# in the notebook is 'complex' — confirm which key is intended.
dtypes = dict(corpus="string", sentence="string", token="string", complexity="float64")

# keep_default_na=False: do not apply pandas' default list of NaN markers while parsing.
df_train, df_test = [
    pd.read_excel(workbook, dtype=dtypes, keep_default_na=False)
    for workbook in ('train.xlsx', 'test.xlsx')
]

In [54]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,corpus,sentence,token,complex
0,bible,"Behold, there came up out of the river seven c...",river,0
1,bible,I am a fellow bondservant with you and with yo...,brothers,0
2,bible,"The man, the lord of the land, said to us, 'By...",brothers,0
3,bible,Shimei had sixteen sons and six daughters; but...,brothers,0
4,bible,"""He has put my brothers far from me.",brothers,0


In [55]:
# Load the word-form table and keep its cell values as a NumPy array; the
# words_forms() helper tests membership against this array.
# NOTE(review): read_excel uses the first sheet row as the header by default,
# so that row does not end up in the array — confirm the sheet has a header row.
word_forms = pd.read_excel('wordForms.xlsx').to_numpy()

## Funcții

In [56]:
def words_forms(word):
  """Return 1 if the lower-cased ``word`` is found in the global ``word_forms`` table, else 0."""
  lowered = word.lower()
  return 1 if lowered in word_forms else 0

In [57]:
def corpus_feature(corpus):
  """Encode the corpus name as a one-element list.

  'bible' maps to [0], 'biomed' to [1], and every other value to [2].
  """
  code = 0 if corpus == 'bible' else 1 if corpus == 'biomed' else 2
  return [code]

In [58]:
def length(word):
  """Return the number of characters in ``word``."""
  return len(word)

In [59]:
def nr_vowels(word):
  """Count the characters of ``word`` that are one of a, e, i, o, u (either case)."""
  vowels = 'aeiouAEIOU'
  return sum(1 for letter in word if letter in vowels)

In [60]:
def nr_consonant(word):
  """Count the characters of ``word`` that are not in the exclusion list.

  The exclusion list holds ASCII vowels, a handful of Greek letters, a set of
  punctuation symbols and the digits 0-9. Every other character is counted,
  so characters that are not listed (for example an apostrophe or a space)
  also increase the result.
  """
  # The backslash is written as an explicit escape: the bare sequence
  # backslash + '|' is an invalid escape that newer Python versions warn about.
  # The resulting string is unchanged (it contains one backslash and one '|').
  not_in = "aeiouAEIOUAαΔΩπλθβδεηπσω[@_!#$%^&*()<>?/\\|}{~:-]0123456789"
  return sum(1 for ch in word if ch not in not_in)

In [61]:
def is_title(word):
  """Return 1 when ``str.istitle`` holds for ``word``, otherwise 0."""
  return 1 if word.istitle() else 0

In [62]:
def is_dale_chall(word):
  """Return 1 if the lower-cased ``word`` is contained in ``DALE_CHALL``, else 0."""
  lowered = word.lower()
  return 1 if lowered in DALE_CHALL else 0

In [63]:
def nr_syllables(word):
  """Estimate the syllable count of ``word`` from pyphen's English hyphenation.

  The word is hyphenated with '-' as the separator and the number of
  '-'-delimited pieces is returned, so hyphens already present in ``word``
  also split it. A word with no hyphenation point yields 1.
  """
  hyphenator = pyphen.Pyphen(lang='en')
  hyphenated = hyphenator.inserted(word, '-')
  # Number of pieces == number of separators + 1.
  return hyphenated.count('-') + 1

In [64]:
def capslock(word):
  """Return 1 when ``word`` equals its upper-cased form and is not exactly one character long, else 0."""
  if word.upper() != word:
    return 0
  return 0 if len(word) == 1 else 1

In [65]:
def get_word_structure_features(word):
    """Return the word-level feature vector for ``word`` as a NumPy array.

    Feature order: syllable count, Dale-Chall membership, length, vowel count,
    consonant count, title-case flag, word-forms membership.
    """
    extractors = (
        nr_syllables,
        is_dale_chall,
        length,
        nr_vowels,
        nr_consonant,
        is_title,
        words_forms,
    )
    return np.array([extract(word) for extract in extractors])

In [66]:
def featurize(row):
    """Build the feature vector for one data row.

    The vector is the corpus code (from ``row['corpus']``) followed by the
    word-structure features of ``row['token']``.
    """
    token = row['token']
    corpus_part = corpus_feature(row['corpus'])
    word_part = get_word_structure_features(token)
    return np.array([*corpus_part, *word_part])

In [67]:
def featurize_df(df):
    """Featurize every row of ``df`` into a 2-D float array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows provide the 'corpus' and 'token' fields read by
        ``featurize``.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), number of features); row ``i`` holds the
        features of the ``i``-th row of ``df`` in iteration order.

    Raises
    ------
    IndexError
        If ``df`` has no rows (the feature count is taken from the first row).
    """
    nr_of_features = len(featurize(df.iloc[0]))
    nr_of_examples = len(df)
    features = np.zeros((nr_of_examples, nr_of_features))
    # Write by position, not by index label: iterrows() yields the frame's
    # labels, which only coincide with positions for a fresh 0..n-1 index.
    for position, (_, row) in enumerate(df.iterrows()):
        features[position, :] = featurize(row)
    return features

## Pregătirea datelor de training și validare pentru testare locală

In [68]:
# Labels and feature matrix for the complete training frame; these are the
# arrays used for the grid search and for the final model.
Y = df_train['complex'].values
X = featurize_df(df_train)

In [69]:
# Shuffle the rows once with a fixed seed so the local train/validation split
# is the same on every fresh run. One uniform permutation is already fully
# random, so repeating the shuffle adds nothing.
df_train = df_train.sample(frac=1, random_state=1).reset_index(drop=True)

In [70]:
# Hold out the tail of the shuffled frame for local validation.
# The split point is derived from the frame length instead of a fixed row
# count; with 80% of 7662 rows this is 6129.
TRAIN_FRACTION = 0.8
n_train = int(len(df_train) * TRAIN_FRACTION)
X_train = df_train.iloc[:n_train]
X_validate = df_train.iloc[n_train:]

In [71]:
# Restart the index at 0 so the row labels of the validation slice match their positions.
X_validate = X_validate.reset_index(drop = True)

In [72]:
# X_train is the training DataFrame slice; X_train1 / Y_train1 are its
# feature matrix and label vector.
Y_train1 = X_train['complex'].values
X_train1 = featurize_df(X_train)

In [73]:
# X_validate is the validation DataFrame slice; X_validate1 / Y_validate1 are
# its feature matrix and label vector.
Y_validate1 = X_validate['complex'].values
X_validate1 = featurize_df(X_validate)

## Cross validation

In [74]:
# Candidate hyper-parameters for the RBF-kernel SVM.
C = [3, 4, 5, 6, 7, 8]
gamma = [0.1, 'auto', 'scale']
param_grid = {'C': C, 'gamma': gamma}

# Shuffled 10-fold split with a fixed seed, scored by balanced accuracy.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
model = SVC(kernel='rbf', class_weight='balanced')

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=cv,
    n_jobs=-1,
)

In [75]:
# Run the grid search on the full training matrices.
grid_result = grid.fit(X, Y)
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']

# Best configuration first, then the mean CV score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))

# These predictions are made on the same X that was passed to fit, so the
# matrix below is not a held-out estimate.
pred_cross = grid_result.best_estimator_.predict(X)
print(confusion_matrix(Y, pred_cross))

Best: 0.806508 using {'C': 7, 'gamma': 'auto'}
0.801264 with: {'C': 3, 'gamma': 0.1}
0.801980 with: {'C': 3, 'gamma': 'auto'}
0.802096 with: {'C': 3, 'gamma': 'scale'}
0.803265 with: {'C': 4, 'gamma': 0.1}
0.804765 with: {'C': 4, 'gamma': 'auto'}
0.803854 with: {'C': 4, 'gamma': 'scale'}
0.803332 with: {'C': 5, 'gamma': 0.1}
0.803187 with: {'C': 5, 'gamma': 'auto'}
0.804072 with: {'C': 5, 'gamma': 'scale'}
0.804343 with: {'C': 6, 'gamma': 0.1}
0.804809 with: {'C': 6, 'gamma': 'auto'}
0.804216 with: {'C': 6, 'gamma': 'scale'}
0.804588 with: {'C': 7, 'gamma': 0.1}
0.806508 with: {'C': 7, 'gamma': 'auto'}
0.804359 with: {'C': 7, 'gamma': 'scale'}
0.805215 with: {'C': 8, 'gamma': 0.1}
0.805561 with: {'C': 8, 'gamma': 'auto'}
0.804214 with: {'C': 8, 'gamma': 'scale'}
[[4972 1940]
 [  32  718]]


## Testare locală

In [76]:
# Fit the SVM with the hyper-parameters reported as best by the grid search
# (C=7, gamma='auto') on the local training split, then predict the
# validation split.
clf = SVC(kernel='rbf', C=7, gamma='auto', class_weight='balanced')
clf.fit(X_train1, Y_train1)
Y_pred_clf = clf.predict(X_validate1)

In [77]:
# Balanced accuracy of the local model on the held-out validation rows.
print(balanced_accuracy_score(Y_validate1, Y_pred_clf))

0.7994488636363637


In [78]:
# Confusion matrix for the validation rows (rows: true labels, columns: predicted labels).
confusion_matrix(Y_validate1, Y_pred_clf)

array([[1046,  362],
       [  18,  107]])

## Antrenare și prezicere pentru modelul final

In [79]:
# Build the feature matrix for the test frame with the same featurize_df used for the training data.
X_test = featurize_df(df_test)

In [80]:
# Train the final model on the full training matrices and predict the test
# set, timing model construction, fit and predict together.
start = timer()
clf = SVC(kernel='rbf', C=7, gamma='auto', class_weight='balanced')
clf.fit(X, Y)
preds = clf.predict(X_test)
end = timer()
print(timedelta(seconds=end - start))

0:00:02.068023


## Scrierea datelor în fișier

In [81]:
# Write the submission as two integer columns: id, complex.
# The id range is sized from the predictions, so the two columns always have
# the same length.
# NOTE(review): 7663 presumably continues the numbering after the 7662
# training rows — confirm against the expected submission format.
FIRST_TEST_ID = 7663
test_id = np.arange(FIRST_TEST_ID, FIRST_TEST_ID + len(preds))
np.savetxt("SVM_submission.csv", np.stack((test_id, preds)).T, fmt = "%d", delimiter = ',', header = "id,complex", comments = "")