In [32]:
from textblob import TextBlob
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.tr import Turkish
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [33]:
data = pd.read_csv('lemessi10.csv')
data

Unnamed: 0,tweet
0,leo messi cristiano special competition among ...
1,poles stop leo messi
2,la liga goal assist king champions league top ...
3,leo messi became first player score goal diffe...
4,come tomorrow start work fenerbahçe
...,...
20099,via drawing lionel messi art lionelmessi barce...
20100,lionel messi made funny comment allegations ma...
20101,lionelmessi dont worry messi father go jail gi...
20102,lionel messi without detonating bomb


In [34]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

data['Subjectivity'] = data['tweet'].apply(getSubjectivity)
data['Polarity'] = data['tweet'].apply(getPolarity)

data

Unnamed: 0,tweet,Subjectivity,Polarity
0,leo messi cristiano special competition among ...,0.586190,0.225119
1,poles stop leo messi,0.000000,0.000000
2,la liga goal assist king champions league top ...,0.766667,0.200000
3,leo messi became first player score goal diffe...,0.466667,0.125000
4,come tomorrow start work fenerbahçe,0.000000,0.000000
...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000
20102,lionel messi without detonating bomb,0.000000,0.000000


In [35]:
def getAnalysis(score):
    if score<0:
        return 'Negative'
    elif score==0:
        return 'Neutral'
    else:
        return 'Positive'
    
data['Analysis'] = data['Polarity'].apply(getAnalysis)
data

Unnamed: 0,tweet,Subjectivity,Polarity,Analysis
0,leo messi cristiano special competition among ...,0.586190,0.225119,Positive
1,poles stop leo messi,0.000000,0.000000,Neutral
2,la liga goal assist king champions league top ...,0.766667,0.200000,Positive
3,leo messi became first player score goal diffe...,0.466667,0.125000,Positive
4,come tomorrow start work fenerbahçe,0.000000,0.000000,Neutral
...,...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000,Neutral
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000,Positive
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000,Negative
20102,lionel messi without detonating bomb,0.000000,0.000000,Neutral


In [36]:
label_enc = preprocessing.LabelEncoder().fit_transform(data['Analysis'])
print(label_enc)

[2 1 2 ... 0 1 1]


In [37]:
vect = TfidfVectorizer(analyzer='word',lowercase=False)
sent_vector = vect.fit_transform(data['tweet'])

print(sent_vector)

  (0, 4901)	0.24890649791616995
  (0, 8400)	0.2248843852005883
  (0, 9616)	0.3005265054293155
  (0, 10463)	0.26443682295436655
  (0, 10755)	0.12437836632510421
  (0, 10674)	0.3197556424753424
  (0, 7505)	0.2278590916842625
  (0, 7275)	0.23068237973977024
  (0, 5929)	0.23467514211515997
  (0, 6787)	0.22452939908218347
  (0, 11463)	0.2566928707888658
  (0, 3862)	0.2623376404230149
  (0, 13088)	0.16746638314826084
  (0, 513)	0.2594037296105074
  (0, 2752)	0.2533717944917551
  (0, 11704)	0.2371030045770058
  (0, 3021)	0.1715437410829645
  (0, 8023)	0.04858473815579372
  (0, 7227)	0.12901752474861564
  (1, 11908)	0.5178896470172775
  (1, 9848)	0.7899569787592855
  (1, 8023)	0.11568574755708025
  (1, 7227)	0.30720529460603546
  (2, 1170)	0.2186445418362964
  (2, 5348)	0.18889864998589814
  :	:
  (20100, 7616)	0.44798571528983555
  (20100, 12290)	0.16821154871745747
  (20100, 8023)	0.06011974980385498
  (20101, 6545)	0.4695915522642063
  (20101, 13604)	0.38123226179833347
  (20101, 4240)	0.31

In [38]:
feature = sent_vector
label=data['Analysis']
print(label)
print(feature)

0        Positive
1         Neutral
2        Positive
3        Positive
4         Neutral
           ...   
20099     Neutral
20100    Positive
20101    Negative
20102     Neutral
20103     Neutral
Name: Analysis, Length: 20104, dtype: object
  (0, 4901)	0.24890649791616995
  (0, 8400)	0.2248843852005883
  (0, 9616)	0.3005265054293155
  (0, 10463)	0.26443682295436655
  (0, 10755)	0.12437836632510421
  (0, 10674)	0.3197556424753424
  (0, 7505)	0.2278590916842625
  (0, 7275)	0.23068237973977024
  (0, 5929)	0.23467514211515997
  (0, 6787)	0.22452939908218347
  (0, 11463)	0.2566928707888658
  (0, 3862)	0.2623376404230149
  (0, 13088)	0.16746638314826084
  (0, 513)	0.2594037296105074
  (0, 2752)	0.2533717944917551
  (0, 11704)	0.2371030045770058
  (0, 3021)	0.1715437410829645
  (0, 8023)	0.04858473815579372
  (0, 7227)	0.12901752474861564
  (1, 11908)	0.5178896470172775
  (1, 9848)	0.7899569787592855
  (1, 8023)	0.11568574755708025
  (1, 7227)	0.30720529460603546
  (2, 1170)	0.2186445418362

In [39]:
x_train,x_test,y_train,y_test= train_test_split(feature,label, test_size=0.20,random_state=0)

In [40]:
from sklearn.svm  import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [41]:
svm = SVC(kernel='linear')#eğitim başlıyor
fit = svm.fit(x_train,y_train)

In [42]:
res = svm.predict(x_test) #test veri kümesini sınıflandırıcıya gönderiyoruz
print(res)

['Positive' 'Neutral' 'Negative' ... 'Neutral' 'Positive' 'Neutral']


In [43]:
conf_mat = confusion_matrix(y_test,res)
print(conf_mat)

[[ 352   61   10]
 [   6 2179    9]
 [   6   60 1338]]


In [44]:
acc = accuracy_score(y_test,res)
print(acc)

0.9621984580950013


In [82]:
from sklearn.metrics import matthews_corrcoef
print(matthews_corrcoef(y_test, res)) 

0.9335620353407431


In [87]:
from sklearn.metrics import precision_score
print(precision_score(y_test, res,average='micro'))

0.9621984580950013


In [90]:
from sklearn.metrics import recall_score
print(recall_score(y_test, res, average='micro'))

0.9621984580950013


In [94]:
from sklearn.metrics import f1_score
print(f1_score(y_test, res, average='macro'))

0.9444958980098712


In [99]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, res))

ValueError: multiclass format is not supported

In [58]:
from sklearn.preprocessing import StandardScaler

# Initialise a new scaling object for normalising input data
sc=StandardScaler(with_mean=False)

# Set up the scaler just on the training set
sc.fit(x_train)

StandardScaler(copy=True, with_mean=False, with_std=True)

In [60]:
# Apply the scaler to the training and test sets
X_train_std=sc.transform(x_train)
X_test_std=sc.transform(x_test)

In [62]:
from sklearn.linear_model import LogisticRegression

ml = LogisticRegression(C=1000)
ml.fit(X_train_std,y_train);



In [69]:
# Predict training and test set labels
y_pred_train = ml.predict(X_train_std)
y_pred_test = ml.predict(X_test_std)

y_pred_test

array(['Positive', 'Neutral', 'Negative', ..., 'Positive', 'Positive',
       'Neutral'], dtype=object)

In [64]:
import numpy as np
accuracy_train = np.mean(y_pred_train == y_train)
accuracy_test = np.mean(y_pred_test == y_test)
print ('Accuracy of predicting training data =', accuracy_train)
print ('Accuracy of predicting test data =', accuracy_test)

Accuracy of predicting training data = 1.0
Accuracy of predicting test data = 0.9500124347177319


In [65]:
def calculate_sensitivity_specificity(y_test, y_pred_test):
    # Note: More parameters are defined than necessary. 
    # This would allow return of other measures other than sensitivity and specificity
    
    # Get true/false for whether a breach actually occurred
    actual_pos = y_test == 1
    actual_neg = y_test == 0
    
    # Get true and false test (true test match actual, false tests differ from actual)
    true_pos = (y_pred_test == 1) & (actual_pos)
    false_pos = (y_pred_test == 1) & (actual_neg)
    true_neg = (y_pred_test == 0) & (actual_neg)
    false_neg = (y_pred_test == 0) & (actual_pos)
    
    # Calculate accuracy
    accuracy = np.mean(y_pred_test == y_test)
    
    # Calculate sensitivity and specificity
    sensitivity = np.sum(true_pos) / np.sum(actual_pos)
    specificity = np.sum(true_neg) / np.sum(actual_neg)
    
    return sensitivity, specificity, accuracy

In [81]:
sensitivity, specificity, accuracy = calculate_sensitivity_specificity(y_test, y_pred_test)
print ('Sensitivity:', sensitivity)
print ('Specificity:', specificity)
print ('Accuracy:', accuracy)

Sensitivity: nan
Specificity: nan
Accuracy: 0.9500124347177319


