In [1]:
from textblob import TextBlob
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.tr import Turkish
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
data = pd.read_csv('lemessi10.csv')
data

Unnamed: 0,tweet
0,leo messi cristiano special competition among ...
1,poles stop leo messi
2,la liga goal assist king champions league top ...
3,leo messi became first player score goal diffe...
4,come tomorrow start work fenerbahçe
...,...
20099,via drawing lionel messi art lionelmessi barce...
20100,lionel messi made funny comment allegations ma...
20101,lionelmessi dont worry messi father go jail gi...
20102,lionel messi without detonating bomb


In [3]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

data['Subjectivity'] = data['tweet'].apply(getSubjectivity)
data['Polarity'] = data['tweet'].apply(getPolarity)

data

Unnamed: 0,tweet,Subjectivity,Polarity
0,leo messi cristiano special competition among ...,0.586190,0.225119
1,poles stop leo messi,0.000000,0.000000
2,la liga goal assist king champions league top ...,0.766667,0.200000
3,leo messi became first player score goal diffe...,0.466667,0.125000
4,come tomorrow start work fenerbahçe,0.000000,0.000000
...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000
20102,lionel messi without detonating bomb,0.000000,0.000000


In [4]:
def getAnalysis(score):
    if score<0:
        return 'Negative'
    elif score==0:
        return 'Neutral'
    else:
        return 'Positive'
    
data['Analysis'] = data['Polarity'].apply(getAnalysis)
data

Unnamed: 0,tweet,Subjectivity,Polarity,Analysis
0,leo messi cristiano special competition among ...,0.586190,0.225119,Positive
1,poles stop leo messi,0.000000,0.000000,Neutral
2,la liga goal assist king champions league top ...,0.766667,0.200000,Positive
3,leo messi became first player score goal diffe...,0.466667,0.125000,Positive
4,come tomorrow start work fenerbahçe,0.000000,0.000000,Neutral
...,...,...,...,...
20099,via drawing lionel messi art lionelmessi barce...,0.000000,0.000000,Neutral
20100,lionel messi made funny comment allegations ma...,1.000000,0.250000,Positive
20101,lionelmessi dont worry messi father go jail gi...,0.375000,-0.050000,Negative
20102,lionel messi without detonating bomb,0.000000,0.000000,Neutral


In [5]:
from transformers import GPT2Model, GPT2Config

In [6]:
# Initializing a GPT2 configuration
configuration = GPT2Config()

# Initializing a model from the configuration
model = GPT2Model(configuration)

# Accessing the model configuration
configuration = model.config

In [10]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
y=str(data['tweet'])
tokenizer(y)['input_ids']

[15,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 443,
 78,
 2085,
 72,
 1067,
 396,
 10115,
 2041,
 5449,
 1871,
 2644,
 198,
 16,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 24971,
 2245,
 443,
 78,
 2085,
 72,
 198,
 17,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 8591,
 300,
 13827,
 3061,
 3342,
 5822,
 7827,
 4652,
 1353,
 2644,
 198,
 18,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 443,
 78,
 2085,
 72,
 2627,
 717,
 2137,
 4776,
 3061,
 814,
 68,
 986,
 198,
 19,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 1282,
 9439,
 923,
 670,
 277,
 877,
 47041,
 16175,
 68,
 198,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 2

In [11]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
y=str(data['tweet'])
tokenizer(y)['input_ids']

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1355256.0), HTML(value='')))




[15,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 443,
 78,
 2085,
 72,
 1067,
 396,
 10115,
 2041,
 5449,
 1871,
 2644,
 198,
 16,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 24971,
 2245,
 443,
 78,
 2085,
 72,
 198,
 17,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 8591,
 300,
 13827,
 3061,
 3342,
 5822,
 7827,
 4652,
 1353,
 2644,
 198,
 18,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 443,
 78,
 2085,
 72,
 2627,
 717,
 2137,
 4776,
 3061,
 814,
 68,
 986,
 198,
 19,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 1282,
 9439,
 923,
 670,
 277,
 877,
 47041,
 16175,
 68,
 198,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 220,
 2

In [13]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

y=str(data['tweet'])

inputs = tokenizer(y, return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=665.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=548118077.0), HTML(value='')))




Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
last_hidden_states

tensor([[[-0.0355, -0.0849, -0.3855,  ..., -0.0875,  0.0036, -0.0178],
         [ 0.2152, -0.8275,  0.6407,  ...,  0.8400, -0.4949, -0.1181],
         [ 0.0314, -0.5078,  0.6045,  ...,  0.2600,  0.0341,  0.0100],
         ...,
         [ 0.2366, -0.5852, -0.1685,  ...,  0.2800, -0.1422,  0.1913],
         [ 0.4807,  0.2491,  0.1754,  ...,  0.1078, -0.0579,  0.1380],
         [ 0.1133,  0.5095, -0.3827,  ...,  0.2266,  0.7354, -0.0030]]],
       grad_fn=<ViewBackward>)

In [16]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
inputs = tokenizer(y, return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
logits = outputs.logits

In [17]:
loss
logits

tensor([[[ -34.2089,  -33.6393,  -35.9190,  ...,  -43.1610,  -42.8073,
           -34.6641],
         [ -42.9984,  -44.8501,  -47.3093,  ...,  -52.0691,  -49.4424,
           -46.3432],
         [ -37.7245,  -38.1523,  -39.6390,  ...,  -46.0464,  -43.8083,
           -36.5935],
         ...,
         [ -48.6898,  -49.7292,  -49.1064,  ...,  -56.2666,  -58.7492,
           -49.3555],
         [ -95.7018,  -94.8466,  -95.8154,  ..., -101.2834, -101.3575,
           -92.6784],
         [ -70.4535,  -70.9345,  -68.2939,  ...,  -79.5183,  -77.0302,
           -66.6397]]], grad_fn=<UnsafeViewBackward>)