In [13]:
import pandas as pd
data = pd.read_csv("Data\data.csv",sep=",",encoding="utf-8")
print(data.head())

                                    sentence  label
0       11'yi 96'ye böl ve kalanı ekrana yaz      4
1                     Brave programını kapat      3
2  yılbaşı günüyisim doğum günü tarihi nedir      2
3                           Bilge doğum günü      2
4                            Bengü Hüp oynat      0


In [4]:
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')
stop_word_list = stopwords.words('turkish')

import re

def preprocess_text(sen):
    #sayıları silme
    sentence = re.sub('[\d\s]',' ',str(sen))
    #noktalama işaretlerini silme
    sentence = re.sub('[^\w\s]',' ',str(sentence))
    #birden çok boşluk silme
    sentence = re.sub(r'\s+',' ',sentence)
    #tek karakterleri silme
    sentence = re.sub(r"\b[\w\w]\b",' ',str(sentence))

    #engellenecek kelimeleri silme
    WPT = nltk.WordPunctTokenizer()
    tokens = WPT.tokenize(sentence)
    filtered_tokens = [token for token in tokens if token not in stop_word_list]
    single_doc = ' '.join(filtered_tokens)

    #hüçük harf dönüştürme
    return single_doc.lower()

x = data['sentence']
y = data['label']

x = x.apply(preprocess_text)

print(x.head())
print(y.head())

0                  yi ye böl kalanı ekrana yaz
1                       brave programını kapat
2    yılbaşı günüyisim doğum günü tarihi nedir
3                             bilge doğum günü
4                              bengü hüp oynat
Name: sentence, dtype: object
0    4
1    3
2    2
3    2
4    0
Name: label, dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akinb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(analyzer='word', lowercase=False)
vect.fit(x)
sent_vector = vect.transform(x)


In [7]:
import joblib
joblib.dump(vect,"newsvmvectorizer.pkl")

['newsvmvectorizer.pkl']

In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(sent_vector,y,test_size=0.2,random_state=0)
print(x_train)
print(y_train)

  (0, 559)	0.5543215829804151
  (0, 516)	0.46516169443164485
  (0, 262)	0.481782159353866
  (0, 104)	0.43326360471224296
  (0, 64)	0.23774099443648164
  (1, 653)	0.3056462303705505
  (1, 636)	0.4106205874041814
  (1, 562)	0.4106205874041814
  (1, 303)	0.34118089077541985
  (1, 286)	0.25803955256495853
  (1, 183)	0.6215891226057044
  (2, 597)	0.44097728026759825
  (2, 134)	0.8174565547971007
  (2, 64)	0.37054529993922036
  (3, 667)	0.5343678635984899
  (3, 234)	0.8452520253469883
  (4, 554)	0.3590477991449122
  (4, 543)	0.4179748301654555
  (4, 435)	0.2580828870451197
  (4, 308)	0.2360858811276036
  (4, 295)	0.4179748301654555
  (4, 225)	0.5747986718914231
  (4, 189)	0.26256794847053816
  (5, 662)	0.8570278316685895
  (5, 541)	0.5152701192049038
  :	:
  (2445, 651)	0.6659881135786343
  (2445, 286)	0.36665332559918024
  (2445, 84)	0.4831251698645332
  (2446, 700)	0.8339563149133267
  (2446, 286)	0.35492262870639996
  (2446, 186)	0.4225479765048263
  (2447, 666)	0.18632527389315678
  (244

In [9]:
from sklearn.svm import SVC
import joblib

# Başarı oranının değiştiği gözlemlenecektir. ( ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’ )
svc = SVC(C=0.5,kernel='linear')

svc.fit(x_train,y_train)
joblib.dump(svc, "newsvcmodel.pkl")

['newsvcmodel.pkl']

In [10]:
print(x_test)

  (0, 645)	0.516480353170578
  (0, 496)	0.516480353170578
  (0, 465)	0.2897067310860404
  (0, 272)	0.6185192798458515
  (1, 635)	0.2973088530121742
  (1, 629)	0.3558459954061369
  (1, 623)	0.41441949218327057
  (1, 334)	0.45611679878304234
  (1, 209)	0.49480859183139453
  (1, 119)	0.4004490994999277
  (2, 597)	0.44845848541624983
  (2, 438)	0.7373050509521475
  (2, 340)	0.5052388036350874
  (3, 670)	0.8509100913988512
  (3, 635)	0.5253113518244189
  (4, 666)	0.3118492956840067
  (4, 420)	0.4872367817886834
  (4, 318)	0.6049255058451385
  (4, 109)	0.54718869472203
  (5, 621)	0.46031025757900984
  (5, 450)	0.46438804554779806
  (5, 308)	0.2549725145290931
  (5, 189)	0.2835731206649055
  (5, 159)	0.6534780118649137
  (6, 706)	0.42500402104348617
  :	:
  (608, 118)	0.46152360987170726
  (608, 90)	0.46152360987170726
  (608, 64)	0.1979412047729903
  (608, 18)	0.4011278798874378
  (609, 682)	0.546399452727346
  (609, 666)	0.2553579097428686
  (609, 366)	0.546399452727346
  (609, 128)	0.39323

In [11]:
resultsvm = svc.predict(x_test)
print(resultsvm)

[0 4 3 4 0 1 1 3 1 3 3 3 3 0 4 0 4 0 1 0 1 3 3 4 2 4 3 3 0 1 4 1 1 2 4 3 4
 3 0 0 3 4 4 4 2 0 4 3 3 1 3 4 4 4 3 4 2 0 0 3 1 1 2 0 3 0 2 0 0 0 0 1 4 1
 4 4 3 0 2 0 3 1 3 3 1 2 2 3 0 0 0 3 0 1 3 3 1 0 2 3 4 1 2 1 3 0 1 3 1 4 0
 4 0 1 0 4 0 0 3 0 1 2 1 0 3 3 1 0 3 1 1 0 0 3 0 3 0 0 1 0 4 1 1 2 3 1 2 2
 4 3 2 3 3 0 0 0 3 0 3 0 0 0 0 3 0 2 4 4 1 3 1 2 4 3 2 0 3 3 0 4 3 0 0 4 0
 3 3 2 4 0 2 3 3 3 2 0 0 0 1 3 2 4 0 0 4 4 0 2 2 2 4 0 1 4 0 2 1 0 1 0 4 1
 3 0 1 0 4 4 4 4 3 4 0 3 1 3 1 2 0 3 3 0 3 1 3 4 3 3 3 2 0 3 0 4 0 1 0 1 3
 0 3 4 1 3 2 3 3 3 0 2 3 1 2 3 2 3 1 1 4 0 2 2 3 1 0 2 2 0 4 0 4 2 3 3 2 3
 4 0 0 2 2 2 1 0 1 0 2 0 0 2 0 4 4 0 4 1 4 1 4 3 4 3 1 1 3 0 3 1 0 3 4 4 3
 2 0 0 1 1 1 1 0 3 4 0 1 2 1 4 3 1 1 2 3 1 2 3 0 3 3 1 0 4 2 0 0 4 1 2 0 1
 2 0 1 0 3 3 0 0 0 3 0 0 0 4 2 0 0 2 0 3 1 0 3 4 3 2 0 2 4 3 3 1 3 1 4 1 0
 0 0 0 4 2 3 1 0 0 1 4 1 1 4 1 4 4 0 1 3 0 2 3 0 2 2 3 1 2 1 3 4 4 1 0 0 3
 1 3 0 2 2 2 0 0 1 0 1 1 1 4 4 4 2 3 3 1 3 0 1 0 0 1 3 4 3 0 4 1 2 3 1 2 3
 0 0 4 2 0 0 0 4 4 1 2 1 

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(solver="liblinear").fit(x_train, y_train)
aucsvm = roc_auc_score(y_test, clf.predict_proba(x_test), multi_class='ovr')
print(aucsvm)
accsvm = accuracy_score(y_test,resultsvm)
print(accsvm)

precision_recall_fscore_support(y_test, resultsvm, average='macro')

0.9999972943722945
0.9967373572593801


(0.9976047904191617, 0.9963242375601926, 0.9969483913419304, None)