## Mount Drive

In [None]:
from google.colab import drive

# Mount Google Drive at an absolute path. Every data/module path used later in
# this notebook is rooted at /content/gdrive, so a mount point relative to the
# current working directory ("gdrive") only lines up while the cwd is /content.
drive.mount("/content/gdrive")

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


## Importing Necessary Modules

In [None]:
!pip install sklearn-crfsuite
!pip install eli5
import sys
import os

py_file_location = "/content/gdrive/MyDrive/CS-445"
sys.path.append(os.path.abspath(py_file_location))
from Gazetteers import gazetteers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import cross_validate
import eli5





## Importing dataset and morphological analysis of the words in it

In [None]:
# Load the entity-annotated corpus as a list of raw lines (line endings kept).
with open("/content/gdrive/MyDrive/CS-445/NE.txt", encoding="utf-8") as file:
  lines = list(file)

In [None]:
# Load the morphological-analysis file as a list of raw lines (line endings kept).
with open("/content/gdrive/MyDrive/CS-445/NE.ma.txt", encoding="utf-8") as file_ma:
  lines_ma = list(file_ma)

The following features are used to build the CRF model. Most of them are already present in the morphological analysis of the training set, so they only had to be parsed out of it before training.

- Root (Stem)
- Part-of-Speech (POS)
- Proper Noun (PROP)
- Noun Case (NCS)
- Orthographic Case (OCS)
- All Inflectional Features (INF)
- Start of the Sentence (SS)


## Pre-processing

### Label Parsing

In [None]:
def _parse_labeled_line(line, line_no=None):
  """Turn one annotated corpus line into a list of ``[word, {}, label]`` entries.

  The line is split on single spaces and its last piece is dropped. Tokens
  outside an entity are labelled ``"O"``. Text enclosed by
  ``<b_enamex TYPE="XXX">`` ... ``<e_enamex>`` is labelled with the first three
  word characters of TYPE: a one-word entity becomes ``U-XXX``; a longer one
  becomes ``B-XXX``, zero or more ``I-XXX`` and a final ``L-XXX``. The empty
  dict in each entry is a placeholder for the token's feature dictionary.

  Args:
    line: one raw line of the annotated corpus.
    line_no: optional 1-based line number, used only in error messages.

  Returns:
    The list of ``[word, {}, label]`` entries for the line.

  Raises:
    ValueError: if an opened entity is never closed or cannot be parsed.
  """
  where = "" if line_no is None else f" (line {line_no})"
  tokens = line.split(" ")[:-1]
  processed = []
  cursor = 0
  while cursor < len(tokens):
    token = tokens[cursor]
    cursor += 1
    if token == '':
      continue  # consecutive spaces yield empty pieces; ignore them
    if "<b_enamex" not in token:
      processed.append([token, {}, "O"])
      continue

    # Entity start: gather pieces up to and including the one that holds the
    # closing tag, re-joining them with the single spaces they were split on.
    span = token
    while "<e_enamex" not in token:
      if cursor == len(tokens):
        raise ValueError(f"unterminated <b_enamex> entity{where}: {span!r}")
      token = tokens[cursor]
      cursor += 1
      span = span + " " + token

    surface = re.findall(r'>([\w\d\s\W\D\S]*)<e_enamex>', span)
    kinds = re.findall(r'TYPE="(\w{3})', span)
    if not surface or not kinds:
      raise ValueError(f"cannot parse entity markup{where}: {span!r}")
    entity_type = kinds[0]
    words = surface[0].split(" ")

    if len(words) > 1:
      last = len(words) - 1
      for position, word in enumerate(words):
        if position == 0:
          prefix = "B"
        elif position == last:
          prefix = "L"
        else:
          prefix = "I"
        processed.append([word, {}, f"{prefix}-{entity_type}"])
    else:
      processed.append([words[0], {}, f"U-{entity_type}"])
  return processed


# One list of [word, {}, label] entries per corpus line.
train_sents = [
  _parse_labeled_line(line, line_no)
  for line_no, line in enumerate(lines, start=1)
]

In [None]:
# Spot-check one parsed sentence.
print(train_sents[9998])

[['Başörtülülerin', {}, 'O'], [',', {}, 'O'], ['üniversitelere', {}, 'O'], ['kayıt', {}, 'O'], ['yaptıramaması', {}, 'O'], ['da', {}, 'O'], [',', {}, 'O'], ['DSP', {}, 'U-ORG'], ["'ye", {}, 'O'], ['-', {}, 'O'], ['bir', {}, 'O'], ['ölçüde', {}, 'O'], ['-', {}, 'O'], ['prim', {}, 'O'], ['kazandırdı', {}, 'O']]


### Binning the dataset
The k-fold split is implemented manually.

In [None]:
# Deal the sentences round-robin into five bins (sentence i lands in bin i % 5),
# then lay the bins end to end so that each bin forms one contiguous run.
folds_X = [train_sents[offset::5] for offset in range(5)]
folds_y = [[] for _ in range(5)]
train_sents_ = [sentence for fold in folds_X for sentence in fold]



In [None]:
# Spot-check one sentence of the re-ordered (binned) list.
print(train_sents_[9998])

[['Hükümetin', {}, 'O'], ['en', {}, 'O'], ['başarılı', {}, 'O'], ['olduğu', {}, 'O'], ['8', {}, 'O'], ['yıl', {}, 'O'], ['kesintisiz', {}, 'O'], ['eğitim', {}, 'O'], ['yasası', {}, 'O'], [',', {}, 'O'], ['Milli', {}, 'O'], ['Eğitim', {}, 'O'], ['Bakanı', {}, 'O'], ['Hikmet', {}, 'B-PER'], ['Uluğbay', {}, 'L-PER'], ['nedeniyle', {}, 'O'], [',', {}, 'O'], ['DSP', {}, 'U-ORG'], ["'ye", {}, 'O'], ['puan', {}, 'O'], ['kazandırdı', {}, 'O']]


### Extracting a full list of the features

In [None]:
# Collect every distinct morphological tag. The third space-separated field of
# an analysis line is a "+"-joined string; everything after its first part is
# treated as a tag (trailing whitespace such as the newline is stripped).
feature_set = {
  tag.rstrip()
  for analysis_line in lines_ma
  for tag in analysis_line.split(" ")[2].split("+")[1:]
}

In [None]:
# Freeze the tag inventory in sorted order. Iterating a set of strings gives a
# different order after every kernel restart (string hashing is randomised),
# which would make the key order of the feature dicts built from `val`
# non-reproducible between runs.
val = sorted(feature_set)

### Part-Of-Speech Tagging
### (From Morph. Analysis)

In [None]:
# POS_dict[sentence number][surface word] -> {'stem': ..., <tag>: 0/1 for every tag in val}
POS_dict = {}
for raw_line in lines_ma:
  fields = raw_line.rstrip().split(" ")
  sentence_no = int(fields[0])
  stem, *tags = fields[2].split("+")
  present = set(tags)
  # One indicator per known tag: 1 if this analysis carries it, 0 otherwise.
  analysis = {'stem': stem}
  analysis.update((tag, 1 if tag in present else 0) for tag in val)
  POS_dict.setdefault(sentence_no, {})[fields[1]] = analysis

In [None]:
# Attach the morphological analysis to every token (slot 1 of each entry).
# POS_dict is keyed by the integer in the first column of the analysis file,
# which is looked up here as the sentence's position + 1.
# Tokens without an analysis get an empty dict, so slot 1 always holds a dict
# (previously the placeholder was an empty string).
for index, train_sent in enumerate(train_sents):
  analyses = POS_dict[index + 1]
  for word in train_sent:
    word[1] = analyses.get(word[0], {})

In [None]:
# Display the first token of the first sentence together with its attached features.
train_sents[0][0]

['Müzik',
 {'A1pl': 0,
  'A1sg': 0,
  'A2pl': 0,
  'A2sg': 0,
  'A3pl': 0,
  'A3pl^DB': 0,
  'A3sg': 1,
  'A3sg^DB': 0,
  'Abl': 0,
  'Abl^DB': 0,
  'Able': 0,
  'Able^DB': 0,
  'Acc': 0,
  'Acquire': 0,
  'Acquire^DB': 0,
  'ActOf': 0,
  'Adamantly': 0,
  'Adj': 0,
  'Adj^DB': 0,
  'Adverb': 0,
  'AfterDoingSo': 0,
  'Agt': 0,
  'Aor': 0,
  'AorPart': 0,
  'AorPart^DB': 0,
  'Aor^DB': 0,
  'AsIf': 0,
  'AsIf^DB': 0,
  'AsLongAs': 0,
  'Become': 0,
  'Become^DB': 0,
  'ByDoingSo': 0,
  'Card': 0,
  'Card^DB': 0,
  'Caus': 0,
  'Caus^DB': 0,
  'Cond': 0,
  'Conj': 0,
  'Cop': 0,
  'Dat': 0,
  'Dat^DB': 0,
  'Demons': 0,
  'Desr': 0,
  'Det': 0,
  'Dim': 0,
  'Dist': 0,
  'Dist^DB': 0,
  'Distrib': 0,
  'Dup': 0,
  'Equ': 0,
  'EverSince': 0,
  'EverSince^DB': 0,
  'FeelLike': 0,
  'Fut': 0,
  'FutPart': 0,
  'Gen': 0,
  'Gen^DB': 0,
  'Hastily': 0,
  'Imp': 0,
  'InBetween': 0,
  'Inf1': 0,
  'Inf2': 0,
  'Inf3': 0,
  'Ins': 0,
  'Ins^DB': 0,
  'Interj': 0,
  'Loc': 0,
  'Loc^DB': 0,
  

In [None]:
def _neighbour_features(sent, j, offset_label, suffix):
    """Features contributed by the context token ``sent[j]``.

    Morphological features are copied with ``suffix`` appended to their names
    (e.g. ``stem_before``); spelling features are keyed with ``offset_label``
    as a prefix (e.g. ``-1:word.lower()``).
    """
    token = sent[j][0]
    morph = sent[j][1] or {}  # tokens without an analysis may carry "" here
    feats = {name + suffix: value for name, value in morph.items()}
    feats.update({
        offset_label + ':word.lower()': token.lower(),
        offset_label + ':word.istitle()': token.istitle(),
        # "isupper" is the established feature name; only the first character is tested.
        offset_label + ':word.isupper()': token[:1].isupper(),
        offset_label + ':word.isdigit()': token.isdigit(),
        offset_label + ':word.gazetteers()': 1 if (token in gazetteers) else 0
    })
    return feats


def word2features(sent, i, window=1):
    """Build the CRF feature dictionary for token ``i`` of ``sent``.

    Args:
        sent: list of ``[word, morph_features, label]`` entries.
        i: index of the token to describe.
        window: how many tokens on each side contribute context features.
            The default of 1 reproduces the original one-neighbour feature
            set. Neighbours at distance ``d`` use the key prefix ``-d:`` /
            ``+d:`` and the morphological suffix ``_before`` / ``_after``
            repeated ``d`` times.

    Returns:
        dict with a bias term, the token's spelling and gazetteer features,
        its morphological features, the context features that exist inside
        the sentence, and ``BOS`` / ``EOS`` flags on the first / last token.
    """
    word = sent[i][0]
    postag = sent[i][1] or {}  # tokens without an analysis may carry "" here

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        # "isupper" is the established feature name; only the first character is tested.
        'word.isupper()': word[:1].isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.gazetteers()': 1 if (word in gazetteers) else 0
    }
    features.update(postag)

    if i > 0:
        for distance in range(1, window + 1):
            if i - distance < 0:
                break  # ran off the start of the sentence
            features.update(_neighbour_features(
                sent, i - distance, '-%d' % distance, '_before' * distance))
    else:
        features['BOS'] = True

    last = len(sent) - 1
    if i < last:
        for distance in range(1, window + 1):
            if i + distance > last:
                break  # ran off the end of the sentence
            features.update(_neighbour_features(
                sent, i + distance, '+%d' % distance, '_after' * distance))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the feature dictionary of every token in ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` entry."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels


# Per-sentence feature dictionaries and per-sentence label sequences.
X_train = list(map(sent2features, train_sents))

y_train = list(map(sent2labels, train_sents))





A modified version that looks at the two preceding and the two following words of the current word. The results obtained with this function are also reported.

In [None]:
# Disabled alternative feature extractor, kept as a bare string so that it does
# not replace the active word2features. It adds the features of the tokens two
# positions before and after the current one.
# Fixed: the "+2" branch used to re-read sent[i+1] and reuse the "_after"
# suffix, so it only duplicated the "+1" features; it now reads sent[i+2] and
# uses "_after_after", mirroring the "-2" branch.
'''
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word[0].isupper() if (len(word)>0) else word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.gazetteers()':1 if (word in gazetteers) else 0
    }
    features.update(postag)
    if i > 0:
        
        word1 = sent[i-1][0]
        
        postag1 = sent[i-1][1]
        postag_before = {}
        for pos in postag1:
          postag_before[pos+"_before"] = postag1[pos]
        features.update(postag_before)
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1[0].isupper() if (len(word1)>0) else word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.gazetteers()':1 if (word1 in gazetteers) else 0
        })
        if (i>1):
          word1 = sent[i-2][0]
        
          postag1 = sent[i-2][1]
          postag_before = {}
          for pos in postag1:
            postag_before[pos+"_before_before"] = postag1[pos]
          features.update(postag_before)
          
          features.update({
              '-2:word.lower()': word1.lower(),
              '-2:word.istitle()': word1.istitle(),
              '-2:word.isupper()': word1[0].isupper() if (len(word1)>0) else word1.isupper(),
              '-2:word.isdigit()': word1.isdigit(),
              '-2:word.gazetteers()':1 if (word1 in gazetteers) else 0
          })
        
    else:
        features['BOS'] = True
    
    if i < len(sent)-1:
        
        word1 = sent[i+1][0]
        
        postag1 = sent[i+1][1]
        postag_after = {}
        for pos in postag1:
          postag_after[pos+"_after"] = postag1[pos]
        features.update(postag_after)
        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1[0].isupper() if (len(word1)>0) else word1.isupper(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.gazetteers()':1 if (word1 in gazetteers) else 0
        })
        if (i < len(sent)-2):
          word1 = sent[i+2][0]
        
          postag1 = sent[i+2][1]
          postag_after = {}
          for pos in postag1:
            postag_after[pos+"_after_after"] = postag1[pos]
          features.update(postag_after)
          
          features.update({
              '+2:word.lower()': word1.lower(),
              '+2:word.istitle()': word1.istitle(),
              '+2:word.isupper()': word1[0].isupper() if (len(word1)>0) else word1.isupper(),
              '+2:word.isdigit()': word1.isdigit(),
              '+2:word.gazetteers()':1 if (word1 in gazetteers) else 0
          })
    else:
        features['EOS'] = True
    
    return features
'''

## Cross Validation
The model tends to consume a large amount of memory and time. Running it in a local runtime with more RAM may be a better option.

In [None]:
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics

# CRF trained with L-BFGS; c1 and c2 are both set to 0.1.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', c1=0.1, c2=0.1,
    max_iterations=100, all_possible_transitions=True,
)

# Macro-averaged scorers built on the "flat" sklearn_crfsuite metrics.
f1_scorer, precision_scorer, recall_scorer = (
    make_scorer(flat_metric, average='macro')
    for flat_metric in (
        metrics.flat_f1_score,
        metrics.flat_precision_score,
        metrics.flat_recall_score,
    )
)
classification_reports = make_scorer(metrics.flat_classification_report)
scoring_ = dict(f1=f1_scorer, precision=precision_scorer, recall=recall_scorer)

# 5-fold cross-validation over the full feature/label lists.
scores = cross_validate(crf, X_train, y_train, scoring=scoring_, cv=5)



In [None]:
# Per-fold scores collected by cross_validate.
for title, key in (("F-1", "test_f1"), ("Precision", "test_precision"), ("Recall", "test_recall")):
  print(f"{title} Score:{scores[key]}")

F-1 Score:[0.76528478 0.79371077 0.82384356 0.83574444 0.79092663]
Precision Score:[0.78653491 0.83255348 0.86734542 0.86707972 0.82891054]
Recall Score:[0.74743282 0.76914888 0.78941053 0.81623949 0.76859413]


In [None]:
from statistics import mean

# Average of the per-fold scores.
for title, key in (("F-1", "test_f1"), ("Precision", "test_precision"), ("Recall", "test_recall")):
  print(f"Average {title} Score:{mean(scores[key])}")

Average F-1 Score:0.8019020344838487
Average Precision Score:0.8364848145284591
Average Recall Score:0.7781651693517605


All features, including the gazetteer lookup, are implemented in the feature-extraction cell above (*word2features*). First, the sentences, in which every word carries its morphological analysis and its label, are passed to the *sent2features* function, which returns one feature dictionary per word. The same sentences are then passed to the *sent2labels* function, which simply extracts the label of every word in the sentence.

## Final model and weights

In [None]:
# Hold out sentences 6000-7999 as the test set and train on everything else.
# The split is applied only while X_train still has one entry per sentence of
# train_sents; running this cell a second time therefore no longer carves
# another slice out of the already reduced training data (which would also
# silently replace X_test with different sentences).
TEST_START, TEST_END = 6000, 8000

if len(X_train) == len(train_sents):
  X_test = X_train[TEST_START:TEST_END]
  y_test = y_train[TEST_START:TEST_END]

  X_train = X_train[:TEST_START] + X_train[TEST_END:]
  y_train = y_train[:TEST_START] + y_train[TEST_END:]
else:
  print("X_train no longer has one entry per sentence; assuming the hold-out "
        "split was already applied and leaving X_train / X_test unchanged.")

In [None]:
# Final model: same hyper-parameters as the cross-validation run, fitted on the
# training part of the hold-out split.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', c1=0.1, c2=0.1,
    max_iterations=100, all_possible_transitions=True,
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

Training the model takes a very long time. Without access to more RAM, it is very hard to fine-tune the CRF.

### Precision, Recall and F-1 Score

In [None]:
# Predict a label sequence for every held-out sentence.
y_pred = crf.predict(X_test)

In [None]:
# Per-label precision / recall / F1 over all labels, including 'O'.
print(metrics.flat_classification_report(y_test,y_pred,digits=3))

              precision    recall  f1-score   support

       B-LOC      0.852     0.719     0.780        64
       B-ORG      0.867     0.793     0.828       213
       B-PER      0.833     0.929     0.878       393
       I-LOC      0.800     0.471     0.593        17
       I-ORG      0.738     0.878     0.802        90
       I-PER      0.821     0.852     0.836        27
       L-LOC      0.852     0.719     0.780        64
       L-ORG      0.867     0.793     0.828       213
       L-PER      0.826     0.921     0.871       393
           O      0.992     0.996     0.994     30796
       U-LOC      0.944     0.890     0.916       639
       U-ORG      0.963     0.853     0.905       306
       U-PER      0.917     0.798     0.853       647

    accuracy                          0.983     33862
   macro avg      0.867     0.816     0.836     33862
weighted avg      0.983     0.983     0.983     33862



In [None]:
# Entity labels only: every class the model learned except the outside tag 'O'.
labels = [*crf.classes_]
del labels[labels.index('O')]
labels

['B-ORG',
 'I-ORG',
 'L-ORG',
 'U-LOC',
 'B-LOC',
 'I-LOC',
 'L-LOC',
 'B-PER',
 'L-PER',
 'U-PER',
 'I-PER',
 'U-ORG']

In [None]:
# Same report restricted to the entity labels, so 'O' no longer enters the averages.
print(metrics.flat_classification_report(y_test,y_pred,digits=3,labels=labels))

              precision    recall  f1-score   support

       B-ORG      0.867     0.793     0.828       213
       I-ORG      0.738     0.878     0.802        90
       L-ORG      0.867     0.793     0.828       213
       U-LOC      0.944     0.890     0.916       639
       B-LOC      0.852     0.719     0.780        64
       I-LOC      0.800     0.471     0.593        17
       L-LOC      0.852     0.719     0.780        64
       B-PER      0.833     0.929     0.878       393
       L-PER      0.826     0.921     0.871       393
       U-PER      0.917     0.798     0.853       647
       I-PER      0.821     0.852     0.836        27
       U-ORG      0.963     0.853     0.905       306

   micro avg      0.884     0.852     0.868      3066
   macro avg      0.857     0.801     0.823      3066
weighted avg      0.888     0.852     0.867      3066



In [None]:
# Overall score of the model on the held-out split.
# NOTE(review): the value shown matches the accuracy row of the first report,
# so this is presumably token-level accuracy - confirm against the library docs.
crf.score(X_test,y_test)

0.9827535290295907

These results are close to those reported by Yeniterzi in her article. In the following parts of this notebook, we will try to improve the performance of the model.

### Model Weights

In [None]:
# Display the learned label-to-label transition weights.
eli5.show_weights(crf, top=30, show=['transition_features'])



From \ To,O,B-LOC,I-LOC,L-LOC,U-LOC,B-ORG,I-ORG,L-ORG,U-ORG,B-PER,I-PER,L-PER,U-PER
O,3.209,0.185,-1.856,-2.073,0.809,0.312,-3.941,-2.658,0.693,1.421,-2.009,-1.838,1.349
B-LOC,-3.28,-0.385,5.438,5.894,-2.248,-0.593,-1.27,-0.666,-0.527,-1.197,-0.161,-0.107,-1.732
I-LOC,-3.135,-0.449,5.117,5.289,-1.53,-0.883,-0.725,-0.535,-0.85,-0.975,0.0,-0.008,-1.517
L-LOC,0.375,1.616,-0.183,-0.486,-0.365,0.01,-0.77,-0.469,-0.548,-1.018,0.0,-0.074,-0.977
U-LOC,2.159,1.591,-0.869,-1.395,1.492,-0.106,-2.098,-1.404,1.082,-0.491,-0.421,-0.164,-1.257
B-ORG,-4.194,-1.162,-0.668,-0.925,-2.622,-1.377,4.647,4.46,-1.568,-1.89,-0.469,-0.68,-2.782
I-ORG,-4.471,-1.109,-1.662,-1.119,-2.679,-2.517,4.093,4.564,-2.425,-2.212,-0.649,-0.516,-2.578
L-ORG,0.678,-0.431,-0.41,-0.617,-0.762,-0.883,-1.744,-1.221,-0.235,-1.299,-0.133,-0.209,-1.733
U-ORG,1.715,0.0,-0.478,-0.489,0.767,0.0,-1.649,-1.003,-0.534,-0.242,0.0,0.0,-0.443
B-PER,-3.417,-0.388,-0.425,-0.392,-1.825,-0.677,-1.073,-0.544,-0.914,-1.942,5.67,8.049,-2.825


In [None]:
# Display the 30 highest-weighted state features for each label.
eli5.show_weights(crf, top=30, show=["targets"])



Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+4.841,word.lower():cumhuriyet,,,,,,,,,,,
+4.721,word.lower():başkan,,,,,,,,,,,
+4.433,Adverb,,,,,,,,,,,
+3.917,word.lower():dünya,,,,,,,,,,,
+3.897,word.lower():başbakan,,,,,,,,,,,
+3.494,word.lower():tck,,,,,,,,,,,
+3.489,word.lower():renault,,,,,,,,,,,
+3.368,word.lower():tv,,,,,,,,,,,
+3.280,Conj,,,,,,,,,,,
+3.265,word.lower():türkler,,,,,,,,,,,

Weight?,Feature
+4.841,word.lower():cumhuriyet
+4.721,word.lower():başkan
+4.433,Adverb
+3.917,word.lower():dünya
+3.897,word.lower():başbakan
+3.494,word.lower():tck
+3.489,word.lower():renault
+3.368,word.lower():tv
+3.280,Conj
+3.265,word.lower():türkler

Weight?,Feature
+1.957,+1:word.lower():cumhuriyeti
+1.576,+1:word.lower():meyhanesi
+1.572,stem:yeni
+1.559,word.lower():türkiyesi
+1.533,+1:word.lower():park
+1.515,+1:word.lower():mahallesi
+1.513,+1:word.lower():pavyonu
+1.415,+1:word.lower():türbe
+1.414,stem_after:türbe
+1.383,+1:word.lower():avrupa

Weight?,Feature
+1.718,+1:word.lower():salonu
+1.232,+1:word.lower():bahçe
+0.941,+1:word.lower():stadı
+0.870,+1:word.lower():federasyonu
+0.842,stem:yıl
+0.837,word.lower():yıl
+0.798,+1:word.istitle()
+0.790,word.lower():hırvat
+0.790,stem:Hırvat
+0.790,+1:word.lower():bölgesi

Weight?,Feature
+2.517,+1:word.lower():'nda
+2.240,word.lower():cumhuriyeti
+1.926,+1:word.lower():'nde
+1.826,stem:rafineri
+1.796,word.lower():avrupa
+1.631,word.lower():bahçe
+1.582,-1:word.lower():türkiyesi
+1.576,word.lower():camii
+1.575,word.lower():meyhanesi
+1.547,stem:Ereğli

Weight?,Feature
+8.071,word.lower():ıstanbul
+6.085,word.lower():abd
+5.430,word.lower():türkiye
+5.390,word.lower():fenerbahçe
+5.174,word.lower():galatasaray
+4.732,word.lower():i̇stanbul
+4.470,word.lower():rusya
+4.398,word.lower():urfa
+4.234,word.lower():i̇ran
+4.198,word.lower():güneydoğu

Weight?,Feature
+2.662,+1:word.lower():saray
+2.153,+1:word.lower():dgm
+2.103,+1:word.lower():bankası
+2.084,+1:word.lower():üniversitesi
+2.064,+1:word.lower():parti
+2.038,word.lower():abd
+2.025,stem:Abd
+1.977,word.lower():şampiyonlar
+1.943,+1:word.lower():belediyesi
+1.904,-1:word.lower():yabancılar

Weight?,Feature
+2.620,+1:word.lower():i̇ş
+2.366,-1:word.lower():
+1.399,Card_before
+1.374,+1:word.lower():başkanlığı
+1.368,stem_after:bilim
+1.347,stem:bilim
+1.243,word.lower():&
+1.240,-1:word.lower():new
+1.240,stem_before:New
+1.194,Num_before

Weight?,Feature
+3.146,word.lower():parti
+3.120,word.lower():dgm
+2.502,word.lower():partisi
+2.346,word.lower():saray
+2.327,word.lower():merkezi
+2.297,word.lower():konseyi
+2.281,word.lower():belediyesi
+2.271,word.lower():kurulu
+2.245,word.lower():hastanesi
+2.189,word.lower():komutanlığı

Weight?,Feature
+6.330,word.lower():trabzonspor
+5.780,word.lower():meclis
+5.226,word.lower():kongre
+5.213,word.lower():erdemir
+5.092,word.lower():milliyet
+5.092,word.lower():juventus
+5.021,word.lower():karabükspor
+4.990,word.lower():bursaspor
+4.915,word.lower():senato
+4.732,word.lower():duma

Weight?,Feature
+1.977,+1:word.lower():clinton
+1.722,stem_before:başbakan
+1.696,stem_before:yardımcı
+1.377,word.lower():i̇hsan
+1.352,word.lower():azi̇z
+1.322,Det_after
+1.319,-1:word.lower():hakem
+1.317,PresPart_after
+1.289,stem_before:hakem
+1.266,stem_after:ıler

Weight?,Feature
+1.419,+1:word.lower():hakan
+1.156,word.lower():halis
+1.156,stem:Halis
+1.156,+1:word.lower():komili
+1.067,Punct_before
+1.039,stem_before:.
+1.039,-1:word.lower():.
+0.870,+1:word.lower():halis
+0.870,stem_after:Halis
+0.868,-1:word.lower():alev

Weight?,Feature
+2.104,word.lower():clinton
+1.702,word.lower():hakan
+1.598,+1:word.lower():'in
+1.363,+1:word.lower():'le
+1.356,stem:hakan
+1.314,+1:word.lower():'i
+1.308,-1:word.lower():kutlu
+1.308,stem_before:kutlu
+1.294,word.lower():ıleri
+1.294,stem:ıler

Weight?,Feature
+4.593,+1:word.lower():bey
+4.481,word.lower():clinton
+4.473,word.lower():lippi
+4.040,word.lower():balic
+3.894,word.lower():erbakan
+3.687,word.lower():uche
+3.596,word.lower():demirel
+3.582,word.lower():toshack
+3.474,word.lower():baykal
+3.399,word.lower():çakıcı
