In [6]:
import numpy as np 
import pandas as pd
from textblob import TextBlob

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
from simpletransformers.classification import ClassificationModel

In [4]:
data = pd.read_csv('lemessi10.csv')
data

Unnamed: 0,tweet
0,leo messi cristiano special competition among ...
1,poles stop leo messi
2,la liga goal assist king champions league top ...
3,leo messi became first player score goal diffe...
4,come tomorrow start work fenerbahçe
...,...
20099,via drawing lionel messi art lionelmessi barce...
20100,lionel messi made funny comment allegations ma...
20101,lionelmessi dont worry messi father go jail gi...
20102,lionel messi without detonating bomb


In [7]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

data['Subjectivity'] = data['tweet'].apply(getSubjectivity)
data['Polarity'] = data['tweet'].apply(getPolarity)

data

Unnamed: 0,tweet,Subjectivity,Polarity
0,leo messi cristiano special competition among ...,0.586190,0.225119
1,poles stop leo messi,0.000000,0.000000
2,la liga goal assist king champions league top ...,0.766667,0.200000
3,leo messi became first player score goal diffe...,0.466667,0.125000
4,come tomorrow start work fenerbahçe,0.000000,0.000000
...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000
20102,lionel messi without detonating bomb,0.000000,0.000000


In [8]:
def getAnalysis(score):
    if score<0:
        return 'Negative'
    elif score==0:
        return 'Neutral'
    else:
        return 'Positive'
    
data['Analysis'] = data['Polarity'].apply(getAnalysis)
data

Unnamed: 0,tweet,Subjectivity,Polarity,Analysis
0,leo messi cristiano special competition among ...,0.586190,0.225119,Positive
1,poles stop leo messi,0.000000,0.000000,Neutral
2,la liga goal assist king champions league top ...,0.766667,0.200000,Positive
3,leo messi became first player score goal diffe...,0.466667,0.125000,Positive
4,come tomorrow start work fenerbahçe,0.000000,0.000000,Neutral
...,...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000,Neutral
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000,Positive
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000,Negative
20102,lionel messi without detonating bomb,0.000000,0.000000,Neutral


In [9]:
print(data.Analysis.unique())
print("Total categories",len(data.Analysis.unique()))

['Positive' 'Neutral' 'Negative']
Total categories 3


In [13]:
data['Analysis'] = pd.factorize(data.Analysis)[0]

data.head()

Unnamed: 0,tweet,Subjectivity,Polarity,Analysis
0,leo messi cristiano special competition among ...,0.58619,0.225119,0
1,poles stop leo messi,0.0,0.0,1
2,la liga goal assist king champions league top ...,0.766667,0.2,0
3,leo messi became first player score goal diffe...,0.466667,0.125,0
4,come tomorrow start work fenerbahçe,0.0,0.0,1


In [15]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42)

In [16]:
train.shape, test.shape

((16083, 4), (4021, 4))

In [21]:
model = ClassificationModel('bert', 'bert-base-multilingual-uncased', num_labels=3, 
                            args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'num_train_epochs': 1},use_cuda=False)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [22]:
model.train_model(train)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16083.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=2011.0), HTML(value='')))







(2011, 0.18036518525586925)

In [23]:
result, model_outputs, wrong_predictions = model.eval_model(test)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4021.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=503.0), HTML(value='')))




In [24]:
predictions = model_outputs.argmax(axis=1)

In [25]:
predictions[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [28]:
actuals = test.Analysis.values
actuals[0:10]

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(actuals, predictions)

0.3292713255409102

In [30]:
sample_text = test.iloc[10]['tweet']
print(sample_text)

scene arg lionelmessi goargentina


In [31]:
model.predict([sample_text])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




(array([0], dtype=int64), array([[ 5.52086878, -0.57775038, -4.48794556]]))