In [2]:
# 引入基本資料處理用函式庫
import numpy as np
import pandas as pd
import os
import warnings
import random

# 引入 Pytorch 函式庫, 神經網路函式庫, Optimizer優化器 Loss function是要幫助我們判斷誤差值的，而Optimizer是要調整參數，來使Loss越小越好。
import torch 
from torch import nn
import torch.optim as optim

# 資料集分割器, 供多重驗證模型使用
from sklearn.model_selection import StratifiedKFold

# 引入單字,單詞分割器
import tokenizers
# 引入主要模型, RoBERTa (Robustly optimized BERT approach)
from transformers import RobertaModel, RobertaConfig, logging


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported
# libraries; consider a narrower filter.
warnings.filterwarnings('ignore')

In [23]:

# `logging` here is the transformers logging module imported above (not the
# stdlib one); this sets its verbosity to WARNING.
logging.set_verbosity_warning()

此區塊主要用於調整所有用到的函式庫使用同一個種子碼，
確保程式及訓練過程及結果可以重現。確保亂數的值固定

In [3]:
def seed_everything(seed_value):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices) and puts cuDNN into deterministic mode so that repeated
    runs produce the same results.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point; the hash seed of
    # the running process was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # Deterministic kernels only, and no autotuning: benchmark mode times
    # several algorithms and may choose a different one on each run, which
    # defeats reproducibility. Setting the flags is harmless without a GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Use 42 as the global seed and apply it to every library right away.
seed = 42
seed_everything(seed)

In [4]:
# Load the training data from the working directory and preview the first rows.
# The recorded preview shows two leftover index columns ("Unnamed: 0.1",
# "Unnamed: 0") next to id / text / sentiment / selected_text.
train = pd.read_csv('train.csv')
train.head()


Unnamed: 0.1,Unnamed: 0,id,text,sentiment,selected_text
0,0,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
1,1,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""can go both ways . We all doubt . It is what ..."
2,2,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
3,3,9,"""once again , you seem to support the killing ...",AGREE,"""seem to support the killing of certain people"""
4,4,9,"""once again , you seem to support the killing ...",AGREE,"""you seem to support the killing of certain pe..."


In [27]:
# Keep only the four columns used downstream (this drops the leftover
# "Unnamed" index columns), then display the frame.
train = train[['id','text','sentiment','selected_text']]
train

Unnamed: 0,id,text,sentiment,selected_text
0,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
1,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""can go both ways . We all doubt . It is what ..."
2,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
3,9,"""once again , you seem to support the killing ...",AGREE,"""seem to support the killing of certain people"""
4,9,"""once again , you seem to support the killing ...",AGREE,"""you seem to support the killing of certain pe..."
...,...,...,...,...
76687,10001,"""And teen sex does n't , by the very nature of...",DISAGREE,"""And teen sex does n't , by the very nature of..."
76688,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76689,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76690,10003,"""Hi Smallax , welcome to the forum . I did a s...",AGREE,"""Hi Smallax , welcome to the forum . I did a s..."


# DataLoader

In [32]:
class ContestDataSet(torch.utils.data.Dataset):
    """Dataset that turns dataframe rows into RoBERTa span-extraction samples.

    Every row must provide ``text`` and ``sentiment``. When the dataframe also
    has a ``selected_text`` column the dataset is treated as labelled and each
    sample additionally carries the start/end token index of the selected span.

    Args:
        df: DataFrame holding the rows.
        max_len: Fixed length of every encoded sequence; shorter sequences are
            padded with id 1, longer ones are truncated.
        vocab_path: Path of the byte-level BPE ``vocab.json`` file.
        merges_path: Path of the byte-level BPE ``merges.txt`` file.
    """

    def __init__(self, df, max_len=500,
                 vocab_path=r"C:\python\NLP_contest\roberta-base\vocab.json",
                 merges_path=r"C:\python\NLP_contest\roberta-base\merges.txt"):
        self.df = df
        self.max_len = max_len
        # True when the frame has a `selected_text` column, i.e. targets exist.
        self.labeled = 'selected_text' in df
        # Byte-level BPE tokenizer: lower-cases its input and adds a space in
        # front of the first word.
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab=vocab_path,
            merges=merges_path,
            lowercase=True,
            add_prefix_space=True,
        )

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.df)

    def get_input_data(self, row):
        """Encode one row into fixed-length model inputs.

        The text is lower-cased, whitespace-normalised and prefixed with a
        single space, e.g. ``'Some User\\tInput'`` becomes ``' some user input'``.
        The id sequence is ``[0] + sentiment ids + [2, 2] + text ids + [2]``
        (0 and 2 are presumably the ``<s>`` / ``</s>`` ids of the roberta-base
        vocabulary — confirm against vocab.json).

        Returns:
            Tuple ``(ids, masks, input_text, offsets)``: ``ids`` and ``masks``
            are tensors of length ``max_len`` (mask is 0 exactly where the id
            is the padding id 1), ``input_text`` is the normalised text, and
            ``offsets`` is a ``(max_len, 2)`` tensor with the character span of
            every token inside ``input_text`` (``(0, 0)`` for special and
            padding tokens).
        """
        input_text = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(input_text)
        sentiment_ids = self.tokenizer.encode(row.sentiment).ids

        prefix = [0] + sentiment_ids + [2, 2]
        ids = prefix + encoding.ids + [2]
        # Special tokens get an empty (0, 0) span so that offsets stay aligned
        # with ids token by token; the prefix length follows the number of
        # sentiment tokens instead of assuming exactly one.
        offsets = [(0, 0)] * len(prefix) + list(encoding.offsets) + [(0, 0)]

        # Truncate over-long sequences (keeping the closing token) so every
        # sample has the same length and batches can be stacked.
        if len(ids) > self.max_len:
            ids = ids[:self.max_len - 1] + [2]
            offsets = offsets[:self.max_len - 1] + [(0, 0)]

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len

        ids = torch.tensor(ids)
        # Attention mask: 1 for real tokens, 0 for padding (id 1).
        masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)

        return ids, masks, input_text, offsets

    def get_target_idx(self, row, input_text, offsets):
        """Locate the selected text and return its first/last token index.

        The selected text is normalised like the input text and searched for
        in ``input_text``; every token whose character span overlaps the match
        belongs to the target.

        When no token overlaps (selected text missing, empty, or cut off by
        truncation) the span covering all text tokens is used instead, and
        ``(0, 0)`` when there are no text tokens at all.

        Returns:
            Tuple ``(start_idx, end_idx)`` of token positions.
        """
        needle = " ".join(row.selected_text.lower().split())
        spans = offsets.tolist() if hasattr(offsets, "tolist") else list(offsets)

        # Character-level labels: 1 for characters inside the selected text.
        char_targets = [0] * len(input_text)
        first_char = input_text.find(needle) if needle else -1
        if first_char >= 0:
            for pos in range(first_char, first_char + len(needle)):
                char_targets[pos] = 1

        # Tokens that contain at least one labelled character.
        target_idx = [
            j for j, (begin, finish) in enumerate(spans)
            if sum(char_targets[begin:finish]) > 0
        ]
        if not target_idx:
            # Fallback: every token that maps to real text.
            target_idx = [j for j, (begin, finish) in enumerate(spans) if finish > begin]
        if not target_idx:
            return 0, 0

        return target_idx[0], target_idx[-1]

    def __getitem__(self, index):
        """Return the sample at position ``index`` as a dictionary.

        Keys: ``ids``, ``masks``, ``input_text``, ``offsets`` and, for labelled
        data, ``start_idx`` and ``end_idx``.
        """
        data = {}
        row = self.df.iloc[index]

        ids, masks, input_text, offsets = self.get_input_data(row)
        data['ids'] = ids
        # The mask lets the model ignore the padded positions.
        data['masks'] = masks
        data['input_text'] = input_text
        data['offsets'] = offsets

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, input_text, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data


In [33]:
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8, num_workers=2):
    """Build the training and validation DataLoaders for one fold.

    Args:
        df: Full training dataframe.
        train_idx: Positional row indices of the training split.
        val_idx: Positional row indices of the validation split.
        batch_size: Samples per batch (default 8).
        num_workers: Worker processes per loader (default 2). Pass 0 to load
            in the main process if worker processes cause problems in your
            environment.

    Returns:
        Dict with the keys ``"train"`` and ``"val"`` mapping to the loaders.
    """
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Training data is reshuffled every epoch; the last incomplete batch is
    # dropped so that all training batches have the same size.
    train_loader = torch.utils.data.DataLoader(
        ContestDataSet(train_df),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    # Validation keeps the original row order and uses every sample.
    val_loader = torch.utils.data.DataLoader(
        ContestDataSet(val_df),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    return {"train": train_loader, "val": val_loader}


In [34]:
def get_test_loader(df, batch_size=16, num_workers=2):
    """Build the DataLoader used for inference on the test set.

    Args:
        df: Test dataframe.
        batch_size: Samples per batch (default 16).
        num_workers: Worker processes used for loading (default 2). Pass 0 to
            load in the main process.

    Returns:
        A DataLoader that yields the rows in their original order, so that
        predictions line up with the dataframe rows.
    """
    return torch.utils.data.DataLoader(
        ContestDataSet(df),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

# Model

In [35]:
class ContestModel(nn.Module):
    """RoBERTa encoder with a linear head predicting span start/end logits.

    Args:
        config_path: Path of the pretrained RoBERTa ``config.json``.
        weights_path: Path of the pretrained RoBERTa weights.
    """

    def __init__(self,
                 config_path=r"C:\python\NLP_contest\roberta-base\config.json",
                 weights_path=r"C:\python\NLP_contest\roberta-base\pytorch_model.bin"):
        super(ContestModel, self).__init__()
        # Load the pretrained configuration and ask the encoder to also
        # return the hidden states of every layer.
        config = RobertaConfig.from_pretrained(
            config_path,
            output_hidden_states=True
        )
        self.roberta = RobertaModel.from_pretrained(
            weights_path,
            config=config
        )
        # Dropout with probability 0.5 in front of the output layer.
        self.dropout = nn.Dropout(0.5)
        # Maps every token vector (config.hidden_size features) to two
        # values: a start logit and an end logit.
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # Mean 0; the standard deviation is left at the function's default.
        # NOTE(review): confirm that this spread is intended for a bias.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for a batch.

        Args:
            input_ids: Token ids — assumes shape (batch, seq_len), TODO confirm.
            attention_mask: Mask with the same shape as ``input_ids``.

        Returns:
            Two tensors with one logit per token position.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index 2 holds the per-layer hidden states. Integer indexing works
        # for a plain tuple and for a ModelOutput, whereas tuple-unpacking a
        # ModelOutput would yield its keys.
        hs = outputs[2]
        # Average the hidden states of the last four layers.
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        # Split the two output features and drop the trailing size-1 axis.
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits


# Loss Function

In [36]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Return the training loss for span extraction.

    The start and the end position are each scored with cross entropy
    against their logits, and the two losses are added.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

# Evaluation Function

In [37]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx`` .. ``end_idx`` (inclusive).

    Each token contributes ``text[begin:end]`` taken from its entry in
    ``offsets``; a single space is appended whenever the next token starts
    after the current one ends.
    """
    pieces = []
    token_count = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        begin, finish = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin:finish])
        following = pos + 1
        # A gap between this token and the next one becomes a space.
        if following < token_count and finish < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2):
    """Return the word-level Jaccard index (intersection over union).

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string.

    Two strings that both contain no words are treated as identical and
    score 1.0 instead of raising ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Score one prediction against its ground-truth span.

    The predicted span runs from the arg-max of ``start_logits`` to the
    arg-max of ``end_logits``; when the start lies after the end, the whole
    ``text`` is used as the prediction. The result is the Jaccard index
    between the ground-truth text and the predicted text.
    """
    best_start = np.argmax(start_logits)
    best_end = np.argmax(end_logits)

    prediction = (
        get_selected_text(text, best_start, best_end, offsets)
        if best_start <= best_end
        else text
    )
    reference = get_selected_text(text, start_idx, end_idx, offsets)

    return jaccard(reference, prediction)

# Training Function

In [38]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename):
    """Train and validate ``model``, then save its weights to ``filename``.

    Every epoch runs a training phase followed by a validation phase and
    prints the average loss and Jaccard score of each phase.

    Args:
        model: Module called as ``model(ids, masks)`` and returning
            ``(start_logits, end_logits)``.
        dataloaders_dict: Dict with ``'train'`` and ``'val'`` loaders whose
            batches contain ``ids``, ``masks``, ``input_text``, ``offsets``,
            ``start_idx`` and ``end_idx``.
        criterion: Callable ``(start_logits, end_logits, start_idx, end_idx)``
            returning the loss tensor.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Number of passes over the data.
        filename: Destination of the saved ``state_dict``.
    """
    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            # Samples actually processed: with drop_last=True the training
            # loader can yield fewer samples than the dataset holds, so the
            # averages must not be divided by the dataset size.
            n_samples = 0

            for data in dataloaders_dict[phase]:
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                texts = data['input_text']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)

                optimizer.zero_grad()

                # Gradients are only tracked during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                batch_len = len(ids)
                n_samples += batch_len
                # loss is the batch mean; weight it by the batch size.
                epoch_loss += loss.item() * batch_len

                # Move everything back to the CPU as NumPy arrays for scoring.
                start_idx = start_idx.cpu().detach().numpy()
                end_idx = end_idx.cpu().detach().numpy()
                start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                for i in range(batch_len):
                    epoch_jaccard += compute_jaccard_score(
                        texts[i],
                        start_idx[i],
                        end_idx[i],
                        start_logits[i],
                        end_logits[i],
                        offsets[i])

            # Guard against an empty loader.
            denominator = max(n_samples, 1)
            epoch_loss = epoch_loss / denominator
            epoch_jaccard = epoch_jaccard / denominator

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

    # Persist the weights once all epochs have finished.
    torch.save(model.state_dict(), filename)

# Training

In [39]:
# Number of passes over the training data for every fold.
num_epochs = 3
# Number of samples per training batch.
batch_size = 8
# Stratified 10-fold cross-validation splitter; rows are shuffled with the
# global seed so that the folds are reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [40]:





# Train and validate once per fold; the folds are stratified on `sentiment`.
# NOTE(review): the preview of `train` shows several rows sharing the same id
# and text, so copies of one text may land in both the training and the
# validation split — consider grouping the split by `id`.
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train.sentiment), start=1): 
    print(f'Fold: {fold}')
    # Every fold starts from a freshly loaded pretrained model.
    model = ContestModel()
    # AdamW optimizer with learning rate 3e-5 and betas (0.9, 0.999).
    optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
    # The loss function is passed on as a callable (it is not called here).
    criterion = loss_fn
    # Build the loaders for this fold's train / validation indices.
    dataloaders_dict = get_train_val_loaders(train, train_idx, val_idx, batch_size)

    logging.set_verbosity_warning()
    
    # Train the model; the weights are saved as roberta_fold{fold}.pth.
    train_model(
        model, 
        dataloaders_dict,
        criterion, 
        optimizer, 
        num_epochs,
        f'roberta_fold{fold}.pth',
        )

Fold: 1


Some weights of the model checkpoint at C:\python\NLP_contest\roberta-base\pytorch_model.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Scratch: normalise the text of row 0 the same way ContestDataSet does
# (lower-case, collapse whitespace, add one leading space) and show it.
input_text = " "+" ".join(train['text'].iloc[0].lower().split())
input_text

' "it can go both ways . we all doubt . it is what you do with it that matters ."'

In [6]:
# Scratch: stand-alone tokenizer built with the same arguments as the one
# created in ContestDataSet.__init__, used by the exploration cells below.
tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab = r"C:\python\NLP_contest\roberta-base\vocab.json",
            merges = r"C:\python\NLP_contest\roberta-base\merges.txt",
            lowercase = True,
            add_prefix_space = True
        )

In [7]:
# Scratch: encode the normalised text; the Encoding exposes ids and offsets.
encoding = tokenizer.encode(input_text)
encoding

Encoding(num_tokens=21, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
# Scratch: encode the sentiment label of row 0 (a single token in the
# recorded output).
sentiment_id = tokenizer.encode(train.iloc[0].sentiment)
sentiment_id

Encoding(num_tokens=1, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# Scratch: token ids of the encoded sentiment label.
sentiment_id.ids

[2854]

In [9]:
# Scratch: full id sequence in the layout used by get_input_data:
# [0] + sentiment ids + [2, 2] + text ids + [2].
ids = [0] + sentiment_id.ids + [2, 2] + encoding.ids + [2]
ids

[0,
 2854,
 2,
 2,
 22,
 405,
 64,
 213,
 258,
 1319,
 479,
 52,
 70,
 2980,
 479,
 24,
 16,
 99,
 47,
 109,
 19,
 24,
 14,
 3510,
 39058,
 2]

In [10]:
# Scratch: character offsets aligned with `ids`. The four leading tokens
# (0, the single sentiment token, 2, 2) and the closing 2 get an empty
# (0, 0) span; the text tokens keep the offsets reported by the tokenizer.
offsets = [(0, 0)]*4 + encoding.offsets + [(0, 0)]
offsets

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 2),
 (2, 4),
 (4, 8),
 (8, 11),
 (11, 16),
 (16, 21),
 (21, 23),
 (23, 26),
 (26, 30),
 (30, 36),
 (36, 38),
 (38, 41),
 (41, 44),
 (44, 49),
 (49, 53),
 (53, 56),
 (56, 61),
 (61, 64),
 (64, 69),
 (69, 77),
 (77, 80),
 (0, 0)]

In [28]:
# Scratch: worked example of the character-level labelling performed in
# ContestDataSet.get_target_idx, using row 0 and the `input_text` built above.
selected_text = " " + " ".join(train['selected_text'].iloc[0].lower().split())

# Length of the selected text without the leading space added above.
len_st = len(selected_text) - 1

# Inclusive character range of the selected text inside input_text; both
# stay None when the selected text does not occur (or is empty).
idx0 = None
idx1 = None
ind = input_text.find(selected_text[1:]) if len_st > 0 else -1
if ind >= 0:
    idx0 = ind
    idx1 = ind + len_st - 1
    print(idx0, idx1)

# One label per character of input_text: 1 inside the selected text, else 0.
char_targets = [0] * len(input_text)
if idx0 is not None and idx1 is not None:
    for ct in range(idx0, idx1 + 1):
        char_targets[ct] = 1
print(char_targets)

# Token-level step, left disabled in the original notebook:
# target_idx = []
# for j, (offset1, offset2) in enumerate(offsets):
#     # keep token j when any of its characters is labelled in char_targets
#     if sum(char_targets[offset1: offset2]) > 0:
#         target_idx.append(j)
#
# # first and last matching token index
# start_idx = target_idx[0]
# end_idx = target_idx[-1]

1 79
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [17]:
# Scratch: normalise selected_text and text of row 1, then print the first
# real character of the selected text and the normalised input text.
selected_text = " " +  " ".join(train['selected_text'].iloc[1].lower().split())
input_text = " "+" ".join(train['text'].iloc[1].lower().split())
print(selected_text[1])
print(input_text)

"
 "it can go both ways . we all doubt . it is what you do with it that matters ."


In [14]:
# Scratch: raw text of row 1.
# NOTE(review): the recorded output below is a whole DataFrame, which does not
# match this expression — the output looks stale; re-run to refresh it.
train['text'].iloc[1]

Unnamed: 0.1,Unnamed: 0,id,text,sentiment,selected_text
0,0,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
1,1,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""can go both ways . We all doubt . It is what ..."
2,2,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
3,3,9,"""once again , you seem to support the killing ...",AGREE,"""seem to support the killing of certain people"""
4,4,9,"""once again , you seem to support the killing ...",AGREE,"""you seem to support the killing of certain pe..."
...,...,...,...,...,...
76687,76687,10001,"""And teen sex does n't , by the very nature of...",DISAGREE,"""And teen sex does n't , by the very nature of..."
76688,76688,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76689,76689,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76690,76690,10003,"""Hi Smallax , welcome to the forum . I did a s...",AGREE,"""Hi Smallax , welcome to the forum . I did a s..."


In [28]:
def get_len(row):
    """Return the number of whitespace-separated words in the string ``row``."""
    words = row.split()
    return len(words)

In [20]:
# NOTE(review): the recorded run of this cell failed with
# "ValueError: DataFrame constructor not properly called!", so `train` was not
# a DataFrame-compatible object at that point. This looks like a leftover
# experiment — confirm whether the cell is still needed.
train = pd.DataFrame(train)

ValueError: DataFrame constructor not properly called!

In [31]:
# Display the training frame (id, text, sentiment, selected_text).
train

Unnamed: 0,id,text,sentiment,selected_text
0,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
1,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""can go both ways . We all doubt . It is what ..."
2,8,"""It can go both ways . We all doubt . It is wh...",AGREE,"""It can go both ways . We all doubt . It is wh..."
3,9,"""once again , you seem to support the killing ...",AGREE,"""seem to support the killing of certain people"""
4,9,"""once again , you seem to support the killing ...",AGREE,"""you seem to support the killing of certain pe..."
...,...,...,...,...
76687,10001,"""And teen sex does n't , by the very nature of...",DISAGREE,"""And teen sex does n't , by the very nature of..."
76688,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76689,10002,"""Was n't sinjin crowing about his plans to tak...",DISAGREE,"""Was n't sinjin crowing about his plans to tak..."
76690,10003,"""Hi Smallax , welcome to the forum . I did a s...",AGREE,"""Hi Smallax , welcome to the forum . I did a s..."


In [29]:
# Count the whitespace-separated words of every text and list the index
# labels of rows with more than 500 words.
# NOTE(review): this is a word count, whereas ContestDataSet's max_len limits
# the number of tokens — a text under 500 words can still exceed 500 tokens.
text_len = train['text'].apply(get_len)
text_len[text_len>500].index

Int64Index([  295,   296,   297,   298,   518,   519,   520,   521,   522,
              523,
            ...
            74787, 74788, 74789, 74790, 74791, 74792, 75298, 75299, 75300,
            75301],
           dtype='int64', length=695)

In [30]:
# Remove the rows whose text has more than 500 words. errors="ignore" keeps
# the cell idempotent: running it again after the rows are already gone no
# longer raises KeyError.
train = train.drop(index=text_len[text_len > 500].index, errors="ignore")