## Instalari si pregatire mediu

In [79]:
!pip install pyphen



In [80]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
import pyphen
import numpy as np
import pandas as pd
from dale_chall import DALE_CHALL
from timeit import default_timer as timer
from datetime import timedelta

## Extragerea si prelucrarea datelor

In [81]:
# Column dtypes for both spreadsheets. The label column is called "complex"
# (it is read as df_train['complex'] further down), so the key has to use that
# name; a key that matches no column is silently ignored by the reader.
dtypes = {"corpus": "string", "sentence": "string", "token": "string", "complex": "int64"}
# keep_default_na=False stops pandas from turning cells such as "null" or "NA" into missing values.
df_train = pd.read_excel('train.xlsx', dtype=dtypes, keep_default_na=False)
df_test = pd.read_excel('test.xlsx', dtype=dtypes, keep_default_na=False)

In [82]:
# Preview the first rows of the training frame to check the loaded columns.
df_train.head()

Unnamed: 0,corpus,sentence,token,complex
0,bible,"Behold, there came up out of the river seven c...",river,0
1,bible,I am a fellow bondservant with you and with yo...,brothers,0
2,bible,"The man, the lord of the land, said to us, 'By...",brothers,0
3,bible,Shimei had sixteen sons and six daughters; but...,brothers,0
4,bible,"""He has put my brothers far from me.",brothers,0


In [83]:
# Shuffle the training rows once, with a fixed seed, so the train/validation
# split and every score derived from it are the same on each fresh run.
# One permutation is already uniformly random; repeating it adds nothing.
df_train = df_train.sample(frac=1, random_state=1).reset_index(drop=True)

In [84]:
# Hold-out split of the shuffled frame: the first 6129 rows are used for
# training, every remaining row is kept for local validation.
X_train, X_validate = df_train.iloc[:6129], df_train.iloc[6129:]

In [85]:
# Renumber the validation rows from 0; after slicing they still carry index labels starting at 6129.
X_validate = X_validate.reset_index(drop = True)

In [86]:
# Load the word-forms spreadsheet and keep its cells as a NumPy array,
# which words_forms() uses for membership checks.
word_forms = np.array(pd.read_excel('wordForms.xlsx'))

## Functii

In [87]:
def words_forms(word):
  """Return 1 if the lower-cased `word` occurs in the global `word_forms` array, else 0."""
  lowered = word.lower()
  if lowered in word_forms:
    return 1
  return 0

In [88]:
def corpus_feature(corpus):
  """Encode the corpus name as a one-element list.

  'bible' maps to [0], 'biomed' to [1] and any other value to [2].
  """
  code = 2
  if corpus == 'bible':
    code = 0
  elif corpus == 'biomed':
    code = 1
  return [code]

In [89]:
def length(word):
  """Return the number of characters in `word`."""
  n_chars = len(word)
  return n_chars

In [90]:
def nr_vowels(word):
  """Count the characters of `word` that are one of a, e, i, o, u (either case)."""
  vowels = 'aeiouAEIOU'
  return sum(1 for letter in word if letter in vowels)

In [91]:
def nr_consonant(word):
  """Return how many characters of `word` are not vowels.

  Computed as total length minus vowel count, so digits, hyphens and other
  non-letter characters are counted as well.
  """
  total_chars = length(word)
  vowel_chars = nr_vowels(word)
  return int(total_chars - vowel_chars)

In [92]:
def is_title(word):
  """Return 1 if `word` is title-cased according to str.istitle(), else 0."""
  return 1 if word.istitle() else 0

In [93]:
def is_dale_chall(word):
  """Return 1 if the lower-cased `word` is contained in DALE_CHALL, else 0."""
  return 1 if word.lower() in DALE_CHALL else 0

In [94]:
# Hyphenators created so far, keyed by language code, so each one is built only once
# instead of on every call.
_HYPHENATORS = {}

def nr_syllables(word, lang = 'en'):
  """Estimate the number of syllables of `word` from its hyphenation points.

  Pyphen inserts '-' at every hyphenation point and the result is split on '-',
  so a hyphen already present in `word` also starts a new chunk. A word with
  no hyphenation point counts as 1.

  Parameters:
    word: the token to analyse.
    lang: Pyphen language code; defaults to 'en' as before.

  Returns:
    The number of '-'-separated chunks, as an int.
  """
  hyphenator = _HYPHENATORS.get(lang)
  if hyphenator is None:
    hyphenator = _HYPHENATORS[lang] = pyphen.Pyphen(lang = lang)
  return len(hyphenator.inserted(word, '-').split('-'))

In [95]:
def get_word_structure_features(word):
    """Build the word-level feature vector for `word`.

    The returned array holds, in this order: syllable count, Dale-Chall flag,
    length, vowel count, consonant count, title-case flag, word-forms flag.
    """
    extractors = (
        nr_syllables,
        is_dale_chall,
        length,
        nr_vowels,
        nr_consonant,
        is_title,
        words_forms,
    )
    return np.array([extract(word) for extract in extractors])

In [96]:
def featurize(row):
    """Turn one data row into its feature vector.

    Reads row['token'] and row['corpus'] and returns a 1-D array made of the
    corpus code followed by the word-structure features of the token.
    """
    token = row['token']
    corpus_part = corpus_feature(row['corpus'])
    word_part = get_word_structure_features(token)
    return np.array([*corpus_part, *word_part])

In [97]:
def featurize_df(df):
    """Featurize every row of `df` into a 2-D float array.

    Row i of the result holds the features of the i-th row of `df` by position,
    whatever index labels the frame carries, so sliced or filtered frames do
    not need their index reset first.

    Parameters:
        df: DataFrame with the columns that featurize() reads.

    Returns:
        Array of shape (len(df), number_of_features). An empty frame yields an
        array of shape (0, 0), because the feature width is taken from the
        first row.
    """
    nr_of_examples = len(df)
    if nr_of_examples == 0:
        return np.zeros((0, 0))
    nr_of_features = len(featurize(df.iloc[0]))
    features = np.zeros((nr_of_examples, nr_of_features))
    # iterrows() yields index labels, which are not positions in general;
    # enumerate supplies the positional row number instead.
    for position, (_, row) in enumerate(df.iterrows()):
        features[position, :] = featurize(row)
    return features

## Pregatirea datelor de training si validare pentru testare locala

In [98]:
# Feature matrix and labels for the complete training set
# (used by the cross-validation and by the final model).
X_full_train = featurize_df(df_train)
Y_full_train = df_train['complex'].to_numpy()

In [99]:
# Feature matrix and labels for the training part of the hold-out split.
X_train1 = featurize_df(X_train)
Y_train1 = X_train['complex'].to_numpy()

In [100]:
# Feature matrix and labels for the validation part of the hold-out split.
X_validate1 = featurize_df(X_validate)
Y_validate1 = X_validate['complex'].to_numpy()

In [101]:
# Fit Gaussian Naive Bayes on the training split, then label the validation split.
gnb = GaussianNB()
gnb.fit(X_train1, Y_train1)
Y_pred_gauss = gnb.predict(X_validate1)

In [102]:
# Balanced accuracy (mean of the per-class recalls) on the validation split.
validation_score = balanced_accuracy_score(Y_validate1, Y_pred_gauss)
print(validation_score)

0.8040467362669529


In [103]:
# Confusion matrix on the validation split: rows are the true classes, columns the predicted ones.
confusion_matrix(Y_validate1, Y_pred_gauss)

array([[889, 496],
       [  5, 143]])

## Cross validation

In [104]:
# 10-fold cross-validation of Gaussian Naive Bayes on the full training set;
# the printed value is the mean balanced accuracy over the folds.
model = GaussianNB()
cv = KFold(n_splits=10, shuffle=True, random_state=1)
fold_scores = cross_val_score(
    model,
    X_full_train,
    Y_full_train,
    cv=cv,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
print(fold_scores.mean())

0.7940896423981612


## Antrenare si prezicere

In [105]:
# Feature matrix for the test set, built with the same featurizer as the training data.
X_test = featurize_df(df_test)

In [106]:
# Train the final model on the full training set and predict the test set,
# printing how long construction, fitting and prediction took together.
start = timer()
gnb = GaussianNB()
gnb.fit(X_full_train, Y_full_train)
preds = gnb.predict(X_test)
end = timer()
elapsed = timedelta(seconds=end - start)
print(elapsed)

0:00:00.005413


In [107]:
print(gnb.class_prior_) # Prior probability of each class as estimated by the fitted model (priors, not likelihoods)

[0.90211433 0.09788567]


## Punere date in fisier

In [108]:
# Submission ids are assumed to run consecutively from 7663 in test-set order
# (the range used so far was 7663..9000) — TODO confirm against the test file.
# The upper bound follows the number of predictions, so the id column and the
# prediction column always have the same length.
first_test_id = 7663
test_id = np.arange(first_test_id, first_test_id + len(preds))
# Two integer columns, written as CSV with an "id,complex" header and no comment prefix.
np.savetxt("Bayes_submission.csv", np.stack((test_id,preds)).T, fmt = "%d", delimiter = ',', header = "id,complex", comments = "")