In [28]:
from textblob import TextBlob
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.tr import Turkish
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [29]:
data = pd.read_csv('lemessi10.csv')
data

Unnamed: 0,tweet
0,leo messi cristiano special competition among ...
1,poles stop leo messi
2,la liga goal assist king champions league top ...
3,leo messi became first player score goal diffe...
4,come tomorrow start work fenerbahçe
...,...
20099,via drawing lionel messi art lionelmessi barce...
20100,lionel messi made funny comment allegations ma...
20101,lionelmessi dont worry messi father go jail gi...
20102,lionel messi without detonating bomb


In [30]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

data['Subjectivity'] = data['tweet'].apply(getSubjectivity)
data['Polarity'] = data['tweet'].apply(getPolarity)

data

Unnamed: 0,tweet,Subjectivity,Polarity
0,leo messi cristiano special competition among ...,0.586190,0.225119
1,poles stop leo messi,0.000000,0.000000
2,la liga goal assist king champions league top ...,0.766667,0.200000
3,leo messi became first player score goal diffe...,0.466667,0.125000
4,come tomorrow start work fenerbahçe,0.000000,0.000000
...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000
20102,lionel messi without detonating bomb,0.000000,0.000000


In [31]:
def getAnalysis(score):
    if score<0:
        return 'Negative'
    elif score==0:
        return 'Neutral'
    else:
        return 'Positive'
    
data['Analysis'] = data['Polarity'].apply(getAnalysis)
data

Unnamed: 0,tweet,Subjectivity,Polarity,Analysis
0,leo messi cristiano special competition among ...,0.586190,0.225119,Positive
1,poles stop leo messi,0.000000,0.000000,Neutral
2,la liga goal assist king champions league top ...,0.766667,0.200000,Positive
3,leo messi became first player score goal diffe...,0.466667,0.125000,Positive
4,come tomorrow start work fenerbahçe,0.000000,0.000000,Neutral
...,...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000,Neutral
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000,Positive
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000,Negative
20102,lionel messi without detonating bomb,0.000000,0.000000,Neutral


In [32]:
from gensim.models import Word2Vec

In [33]:
corpus_text = '\n'.join(data['tweet'])

model = Word2Vec(data['tweet'],size=100,window=5,min_count=3,workers=4)
#windows : Bir cümle içindeki mevcut ve tahmin edilen kelime arasındaki maksimum mesafe
vectors = model.wv
del model

In [46]:
from sklearn import model_selection,preprocessing
x_train,x_test,y_train,y_test = model_selection.train_test_split(data['tweet'],data['Analysis'])

In [47]:
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [49]:
x_train

array(['cristiano ronaldos second coronavirus test positive star player able play match played archrival lionel messi october champions league aliyev armeniakillscivilians cristianoronaldo covid coronavirus asi cinkimler',
       'what messi show', 'messis account man', ..., 'aiye man messi',
       'messi legend escapes ship sinks',
       'earning additional freespinoyin investmentbihterziyagil carsamba june georgefloyd messi cherry tokattay'],
      dtype='<U348')

In [48]:
x_train=np.array(x_train.to_list())
x_test=np.array(x_test.to_list())

In [50]:
vect = TfidfVectorizer(analyzer='word',lowercase=False)
sent_vector = vect.fit_transform(x_train)

print(sent_vector)

  (0, 2274)	0.2752284460130818
  (0, 774)	0.2752284460130818
  (0, 2684)	0.16365586953330805
  (0, 2740)	0.15761111785449078
  (0, 721)	0.2752284460130818
  (0, 353)	0.2752284460130818
  (0, 6509)	0.12649600293589897
  (0, 2139)	0.1444429478541431
  (0, 8268)	0.2218277119823306
  (0, 7283)	0.040496716911710334
  (0, 6706)	0.09922789302637415
  (0, 648)	0.25457022214490055
  (0, 8837)	0.16905444359321395
  (0, 7113)	0.12453453754899883
  (0, 8836)	0.1422144424818988
  (0, 38)	0.1895020410220139
  (0, 8838)	0.12599340220338143
  (0, 10687)	0.17079528974361383
  (0, 8915)	0.23040164955612302
  (0, 11184)	0.2230961381796307
  (0, 2627)	0.35918832117966093
  (0, 10006)	0.1912302652486134
  (0, 9724)	0.22061107296943888
  (0, 2736)	0.1436213593690554
  (1, 10264)	0.6728367561536944
  :	:
  (15073, 7113)	0.14259153829129856
  (15073, 8838)	0.14426193237892562
  (15074, 12078)	0.44963983109648
  (15074, 8198)	0.543059326807146
  (15074, 1962)	0.48862818224169513
  (15074, 4575)	0.5037903398923

In [51]:
vect = TfidfVectorizer(analyzer='word',lowercase=False)
sent_vector2 = vect.fit_transform(x_test)

print(sent_vector2)

  (0, 2692)	0.3476746519714153
  (0, 4984)	0.3317087383542859
  (0, 2972)	0.25508504688186306
  (0, 3352)	0.31159406604495277
  (0, 6316)	0.2843001564397281
  (0, 4196)	0.16205150486880904
  (0, 3397)	0.32038074236619063
  (0, 6368)	0.22092566090804092
  (0, 1776)	0.31159406604495277
  (0, 5006)	0.31159406604495277
  (0, 4311)	0.05398388385112272
  (0, 3867)	0.14264955406268193
  (0, 6427)	0.23974917531440298
  (0, 4043)	0.2710509604989925
  (1, 5930)	0.3371751876218296
  (1, 6114)	0.4250224613529402
  (1, 2573)	0.48953875341405584
  (1, 658)	0.18256222882989373
  (1, 2882)	0.5217730275364253
  (1, 5917)	0.3127478257081147
  (1, 4311)	0.08857937860870979
  (1, 3867)	0.23406631676463066
  (2, 3143)	0.5225781737964909
  (2, 6759)	0.5605167072891325
  (2, 7433)	0.42943441491325984
  :	:
  (5022, 2808)	0.3897210525555476
  (5022, 7207)	0.4525503942833895
  (5022, 3314)	0.38603011186519914
  (5022, 2870)	0.3104392503687393
  (5022, 3303)	0.25919221691224026
  (5022, 2295)	0.2576568298932024

In [52]:
x_train

array(['cristiano ronaldos second coronavirus test positive star player able play match played archrival lionel messi october champions league aliyev armeniakillscivilians cristianoronaldo covid coronavirus asi cinkimler',
       'what messi show', 'messis account man', ..., 'aiye man messi',
       'messi legend escapes ship sinks',
       'earning additional freespinoyin investmentbihterziyagil carsamba june georgefloyd messi cherry tokattay'],
      dtype='<U348')

In [53]:
from sklearn.svm import SVC

In [54]:
from sklearn.model_selection import cross_val_score
svc=SVC()
svc_model=svc.fit(sent_vector,y_train)

In [57]:
accuracy=model_selection.cross_val_score(svc_model,sent_vector2,y_test,cv=10).mean()
print(accuracy)

0.8396354938100481


In [60]:
#test veri kümesini sınıflandırıcıya gönder
res = svc.predict(sent_vector)
print(res)

[2 1 1 ... 1 0 1]


In [68]:
from sklearn.metrics import f1_score
score_f1 = f1_score(y_train,res, average='macro')
print('F-Measure: %.3f' % score_f1)

F-Measure: 0.991


In [69]:
from sklearn.metrics import recall_score
recall = recall_score(y_train, res, average='macro')
print('Recall: %.3f' % recall)

Recall: 0.986


In [71]:
from sklearn.metrics import matthews_corrcoef
print(matthews_corrcoef(y_train, res)) 

0.9909263073811317


In [73]:
from sklearn.metrics import precision_score
print(precision_score(y_train, res,average='macro'))

0.996146312901867
