In [6]:
import pandas as pd
import numpy as np
import gc
from collections import Counter
import csv
import math
import gzip
import pathlib
import os

from nltk.tokenize.casual import TweetTokenizer

from stan_data_imports.social.social_apis.twitter import TwitterAPI
from stan_data_imports.social.twitter_import import create_twitter_df

In [7]:
# Project directories are resolved relative to the notebook's working directory.
cwd = pathlib.Path.cwd()
repo_dir = cwd.parent
dataset_dir = repo_dir.joinpath("datasets", "semeval")

# Task A files sit in their own sub-directory of the dataset folder.
subtask_dir = dataset_dir.joinpath("Subtask_A")

In [8]:
# Read every *.txt annotation file of the sub-task. The listing is sorted so that the
# load order - and therefore which row survives drop_duplicates below - does not depend
# on the operating system's directory ordering.
data = []
for file in sorted(os.listdir(subtask_dir)):
    if file.endswith('.txt'):
        with open(subtask_dir / file) as f:
            print(file)
            data += f.readlines()
# Lines are tab-separated; all spaces are stripped before splitting.
data = [line.replace('\n','').replace(' ','').split('\t') for line in data]

df_a = pd.DataFrame(data, columns=["ID", "Sentiment", "Junk"])
# Normalise the capitalisation of the three sentiment labels.
df_a["Sentiment"] = df_a["Sentiment"].replace({"negative": "Negative",
                                               "positive": "Positive",
                                               "neutral": "Neutral"})
df_a = df_a[["ID", "Sentiment"]]
print(df_a.shape)
# Non in-place: df_a is a column slice, so mutating it in place can trigger SettingWithCopyWarning.
df_a = df_a.drop_duplicates(subset=["ID"])
print(df_a.shape)

# set_index moves "ID" into the index in one step (the positional-axis form
# `drop("ID", 1)` is no longer accepted by recent pandas releases).
df_a = df_a.set_index("ID")
map_a = df_a.to_dict(orient='index')

twitter-2013dev-A.txt
twitter-2013test-A.txt
twitter-2013train-A.txt
twitter-2014sarcasm-A.txt
twitter-2014test-A.txt
twitter-2015test-A.txt
twitter-2015train-A.txt
twitter-2016dev-A.txt
twitter-2016devtest-A.txt
twitter-2016test-A.txt
twitter-2016train-A.txt
(50334, 2)
(49570, 2)


In [9]:
# Print how many rows of df_a carry each sentiment label.
print(df_a["Sentiment"].value_counts())

Neutral     22202
Positive    19636
Negative     7732
Name: Sentiment, dtype: int64


In [10]:
# Ask the project's TwitterAPI wrapper about every ID in df_a's index.
# NOTE(review): presumably this performs network requests and needs credentials configured elsewhere - confirm.
api = TwitterAPI()
tweets_raw = api.fortify_twitter_tweets_batch(tweet_ids=df_a.index.tolist())

In [11]:
# Convert the raw results into the project's records: the first element of
# tweets_raw is iterated as tweets, the second as users.
twit_mentions = [api.parse_tweet_to_twitter_mention(tweet) for tweet in tweets_raw[0]]
twit_users = [api.parse_user_to_twitter_user(user) for user in tweets_raw[1]]

# Build the combined frame and preview its first rows.
df_a_tweets = create_twitter_df(twit_mentions, twit_users)
df_a_tweets.head()

Total Records: 35632


Unnamed: 0,Tweet ID,Domain,Source,Url,Author ID,Date (GMT),Date (Local),Date (Local - Zone),Snippet,Sentiment,...,Bio,Profile Picture URL,Follower Count,Profile Picture,Verified,Number of Statuses,Date Created,Status Merged,Listed Count,Friends Count
0,264110966025908224,twitter.com,TwitterAPI,https://twitter.com/statuses/264110966025908224,twitter.com24698721,2012-11-01 21:05:35+00:00,,,Tonight Dr. Terrie Hale Scheckelhoff will be f...,Not Found,...,This is the official site for St. Catherine's ...,https://pbs.twimg.com/profile_images/958005546...,2619,https://pbs.twimg.com/profile_images/958005546...,False,9597,2009-03-16 14:30:52+00:00,[],53,386
1,263138318550700032,twitter.com,TwitterAPI,https://twitter.com/statuses/263138318550700032,twitter.com99380608,2012-10-30 04:40:38+00:00,,,"@solz_b He's a true Niners fan, he brought it ...",Not Found,...,Sup! Instagram: rodh80,https://pbs.twimg.com/profile_images/514561570...,1499,https://pbs.twimg.com/profile_images/514561570...,False,19213,2009-12-25 23:36:07+00:00,[],67,1626
2,250754443665080322,twitter.com,TwitterAPI,https://twitter.com/statuses/250754443665080322,twitter.com254373775,2012-09-26 00:31:32+00:00,,,Who's going to Concords football game this Sat...,Not Found,...,ya love to see it,https://pbs.twimg.com/profile_images/774637912...,657,https://pbs.twimg.com/profile_images/774637912...,False,8708,2011-02-19 04:26:45+00:00,[],0,366
3,262784216729796608,twitter.com,TwitterAPI,https://twitter.com/statuses/262784216729796608,twitter.com302369855,2012-10-29 05:13:34+00:00,,,Up 20 points in my money league with Vernon Da...,Not Found,...,Hockey addict. Thinker of things. Many unpopul...,https://pbs.twimg.com/profile_images/583859560...,202,https://pbs.twimg.com/profile_images/583859560...,False,13902,2011-05-21 02:28:33+00:00,[],1,366
4,263642549640650752,twitter.com,TwitterAPI,https://twitter.com/statuses/263642549640650752,twitter.com37473785,2012-10-31 14:04:16+00:00,,,@gleekyspnluver @flippinstarkids It says on Wi...,Not Found,...,Gay. Tattooed. Crocheter. Music lover. Potty m...,https://pbs.twimg.com/profile_images/463076068...,85,https://pbs.twimg.com/profile_images/463076068...,False,15499,2009-05-03 18:49:11+00:00,[],9,195


In [13]:
# Set the "Sentiment" column from map_a, looked up by each row's "Tweet ID".
# A tweet ID that is not a key of map_a raises KeyError.
# NOTE(review): map_a is keyed by the ID strings parsed from the annotation files - confirm "Tweet ID" holds the same type.
df_a_tweets["Sentiment"] = df_a_tweets["Tweet ID"].apply(lambda e: map_a[e]['Sentiment'])

In [14]:
# Label counts of the rows in df_a_tweets.
df_a_tweets["Sentiment"].value_counts()

Neutral     16106
Positive    14302
Negative     5224
Name: Sentiment, dtype: int64

In [35]:
# `df` becomes a second name for the same DataFrame object as df_a_tweets (no copy is made).
df = df_a_tweets

In [55]:
def my_round(x, base=.05, prec=2):
    """
    Round a value to the nearest multiple of ``base``.

    :param x: value to round; it is converted with ``float`` first
    :param base: step size the result should be a multiple of
    :param prec: number of digits handed to the final ``round`` call

    :return: ``base`` times the rounded number of steps, rounded to ``prec`` digits
    """
    steps = round(float(x) / base)
    nearest_multiple = base * steps
    return round(nearest_multiple, prec)


def split_dataset_into_even_class_distributions(X_data, Y_data, train_size, test_size, mini_batch_size=32,
                                                random_state=None):
    """
    Split a dataset into train, validate, and test sets that have the same distribution of classes as the original data.
    For every class the number of train and test records is rounded to a multiple of mini_batch_size; the
    validate set receives whatever is left of each class.

    The inputs are not modified: the split is computed on positionally re-indexed copies.

    :param X_data: pandas Series / DataFrame or numpy array, each row is a record
    :param Y_data: Pandas Series (or other 1-d sequence), the categorical class labels, one per row of X_data
    :param train_size: float 0-1, quantity of the original dataset to be in the train set (val is created from the residual)
    :param test_size: float 0-1, quantity of the original dataset to be in the test set (val is created from the residual)
    :param mini_batch_size: Int, the size of the mini batches to ensure each dataset is a multiple of that values
    :param random_state: optional int seed for a reproducible split; None (default) draws from numpy's
                         global random state

    :return: 6 data sets of X and Y
    :raises ValueError: if X_data and Y_data differ in length, or if the rounded train + test counts of a
                        class exceed the number of records available for it
    """

    # reset_index / pd.Series return new objects, so the caller's index is left untouched.
    Y_data = pd.Series(Y_data).reset_index(drop=True)
    x_is_pandas = isinstance(X_data, (pd.Series, pd.DataFrame))
    X_data = X_data.reset_index(drop=True) if x_is_pandas else np.asarray(X_data)

    if len(X_data) != len(Y_data):
        raise ValueError("X_data and Y_data must have the same number of records "
                         "(%d != %d)" % (len(X_data), len(Y_data)))

    # np.random and RandomState expose the same `choice` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    dist = Counter(val for val in Y_data)
    print('Total class distribution:', dict(dist))

    train_ix = []
    validate_ix = []
    test_ix = []
    for cls in dist.keys():
        # int(): my_round returns a float when mini_batch_size is a float, but `size` must be integral.
        num_train = int(my_round(dist[cls] * train_size, base=mini_batch_size))
        num_test = int(my_round(dist[cls] * test_size, base=mini_batch_size))

        if num_train + num_test > dist[cls]:
            raise ValueError("Class %r has %d records but %d train + %d test records were requested"
                             % (cls, dist[cls], num_train, num_test))

        cls_targets = Y_data[Y_data == cls]

        full_ixs = rng.choice(cls_targets.index, size=num_train, replace=False)

        train_ix += list(full_ixs)

        # Remove what was just drawn so the test sample cannot overlap with train.
        cls_targets = cls_targets[~cls_targets.index.isin(full_ixs)]

        full_ixs = rng.choice(cls_targets.index, size=num_test, replace=False)

        test_ix += list(full_ixs)

        cls_targets = cls_targets[~cls_targets.index.isin(full_ixs)]

        # Everything not drawn for train or test becomes validation data.
        validate_ix += list(cls_targets.index)

    def _take_rows(positions):
        # After the re-indexing above, labels equal row positions; .iloc selects rows for both
        # Series and DataFrames (plain `X[list]` would select DataFrame columns).
        if x_is_pandas:
            return X_data.iloc[positions].reset_index(drop=True)
        return X_data[np.asarray(positions, dtype=int)]

    X_train = _take_rows(train_ix)
    Y_train = Y_data.iloc[train_ix].reset_index(drop=True)

    X_validate = _take_rows(validate_ix)
    Y_validate = Y_data.iloc[validate_ix].reset_index(drop=True)

    X_test = _take_rows(test_ix)
    Y_test = Y_data.iloc[test_ix].reset_index(drop=True)

    print('Train class distribution:', dict(Counter(val for val in Y_train)))
    print('Validate class distribution:', dict(Counter(val for val in Y_validate)))
    print('Test class distribution:', dict(Counter(val for val in Y_test)))

    return X_train, Y_train, X_validate, Y_validate, X_test, Y_test


def parse_df_into_sets_by_tertiary_grouping(df, ter="ID", y="Score", x="text", train_prop=.7, test_prop=.2,
                                            random_state=None):
    """
    Split a dataframe into train, validate and test sets so that all rows sharing a value of the
    grouping column end up in the same set.

    Groups are drawn at random for train, then for test, until each set is within one average group
    size of its ideal row count; the groups that remain form the validate set. If the groups run out
    before a target is reached, drawing simply stops (later sets may then be empty).

    :param df: pandas DataFrame holding the grouping, label and feature columns
    :param ter: name of the grouping column
    :param y: name of the label column
    :param x: name of the feature column
    :param train_prop: float 0-1, ideal share of rows in the train set
    :param test_prop: float 0-1, ideal share of rows in the test set
    :param random_state: optional int seed for a reproducible split; None (default) draws from numpy's
                         global random state

    :return: x and y columns of the train, validate and test sets (6 Series, in that order)
    """

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    ideal_train_size = math.floor(len(df)*train_prop)
    ideal_test_size = math.floor(len(df)*test_prop)
    ideal_val_size = len(df) - (ideal_train_size + ideal_test_size)

    # Count the rows of every group once instead of re-scanning the frame for each drawn group.
    group_sizes = df[ter].value_counts()
    title_avg_count = group_sizes.mean()
    rows_in_group = group_sizes.to_dict()

    titles = df[ter].unique().tolist()

    def _draw_groups(target_size):
        # Moves randomly chosen groups out of `titles` until the target is (nearly) reached
        # or no groups are left; returns the chosen groups and their total row count.
        chosen = []
        chosen_rows = 0
        while titles and chosen_rows <= target_size - title_avg_count:
            title = rng.choice(titles)
            titles.remove(title)
            chosen.append(title)
            chosen_rows += rows_in_group.get(title, 0)
        return chosen, chosen_rows

    train_titles, train_count = _draw_groups(ideal_train_size)
    test_titles, test_count = _draw_groups(ideal_test_size)

    # Whatever was not drawn for train or test is used for validation.
    val_titles = titles
    val_count = len(df[df[ter].isin(val_titles)])

    print("Ideal train size is:", ideal_train_size, "whearas the actual is:", train_count)
    print("Ideal val size is:", ideal_val_size, "whearas the actual is:", val_count)
    print("Ideal test size is:", ideal_test_size, "whearas the actual is:", test_count)

    df_trn = df[df[ter].isin(train_titles)]
    df_val = df[df[ter].isin(val_titles)]
    df_tst = df[df[ter].isin(test_titles)]

    print("Train class distribution:", df_trn[y].value_counts())
    print("Val class distribution:", df_val[y].value_counts())
    print("Test class distribution:", df_tst[y].value_counts())

    return df_trn[x], df_trn[y], df_val[x], df_val[y], df_tst[x], df_tst[y]

def balance_classes(X_data, Y_data, min_class_value=None, random_state=None):
    """
    Down-sample every class to the same number of records.

    :param X_data: pandas Series / DataFrame sharing Y_data's index (a numpy array is indexed with
                   Y_data's index labels directly)
    :param Y_data: Pandas Series, the categorical class labels
    :param min_class_value: Int, number of records to keep per class; when omitted (or falsy) the size
                            of the rarest class is used
    :param random_state: optional int seed for reproducible sampling; None (default) draws from numpy's
                         global random state

    :return: the balanced X and Y data; rows keep their original index labels
    :raises ValueError: if a class has fewer than min_class_value records
    """

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    class_counts = Y_data.value_counts()
    kes = class_counts.keys().tolist()

    if not kes:
        # Nothing to balance; the original data is handed back unchanged.
        print("Y class distribution:", class_counts)
        return X_data, Y_data

    if not min_class_value:
        # value_counts sorts in descending order, so the last entry is the rarest class.
        min_class_value = class_counts.values[-1]

    all_ix = []

    for ke in kes:
        ke_indexes = Y_data[Y_data == ke].index.tolist()
        if len(ke_indexes) < min_class_value:
            raise ValueError("Class %r has only %d records, cannot sample %d without replacement"
                             % (ke, len(ke_indexes), min_class_value))
        if len(ke_indexes) == min_class_value:
            all_ix += ke_indexes
        else:
            all_ix += list(rng.choice(ke_indexes, size=min_class_value, replace=False))

    # .loc selects rows by label for Series and DataFrames alike
    # (plain `X[list]` would select DataFrame columns).
    X_data = X_data.loc[all_ix] if hasattr(X_data, 'loc') else X_data[all_ix]
    Y_data = Y_data.loc[all_ix]

    print("Y class distribution:", Y_data.value_counts())

    return X_data, Y_data

In [43]:
# Label counts of the frame currently bound to `df`.
df['Sentiment'].value_counts()

Neutral     16106
Positive    14302
Negative     5224
Name: Sentiment, dtype: int64

In [39]:
# Unbalanced
# Stratified split: 3000 records for training, everything beyond the first 6000 for testing,
# and the residual for validation.
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = split_dataset_into_even_class_distributions(
    X_data=df['Snippet'],
    Y_data=df['Sentiment'],
    train_size=3000/len(df),
    test_size=1-(6000/len(df)),
    mini_batch_size=1)

df_trn = pd.DataFrame(np.array([Y_train, X_train]).T, columns=['label', 'text'])
df_val = pd.DataFrame(np.array([Y_validate, X_validate]).T, columns=['label', 'text'])
df_tst = pd.DataFrame(np.array([Y_test, X_test]).T, columns=['label', 'text'])

N_CLASSES = len(Y_train.value_counts())
print(N_CLASSES, "classes")

for size in [100, 300, 1000]:
    out_dir = subtask_dir / ("fair_unbalanced_"+str(size))
    # to_csv does not create missing folders, so make sure the target exists first.
    out_dir.mkdir(parents=True, exist_ok=True)
    # Each train file is a random sub-sample of size * N_CLASSES rows; validate and test are written in full.
    df_trn.sample(n=size*N_CLASSES).to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

Total class distribution: {'Positive': 14302, 'Neutral': 16106, 'Negative': 5224}
Train class distribution: {'Positive': 1204, 'Neutral': 1356, 'Negative': 440}
Validate class distribution: {'Positive': 1204, 'Neutral': 1356, 'Negative': 440}
Test class distribution: {'Positive': 11894, 'Neutral': 13394, 'Negative': 4344}
3 classes


In [60]:
# Balanced
# Stratified 20% / 20% / 60% train / validate / test split; the classes are equalised afterwards.
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = split_dataset_into_even_class_distributions(
    X_data=df['Snippet'],
    Y_data=df['Sentiment'],
    train_size=.2,
    test_size=.6,
    mini_batch_size=1)

print(Y_train.value_counts())
print(Y_test.value_counts())

# Validation is cut down to its rarest class, test to 3000 records per class.
X_validate, Y_validate = balance_classes(X_validate, Y_validate, min_class_value=None)
X_test, Y_test = balance_classes(X_test, Y_test, min_class_value=3000)

df_val = pd.DataFrame(np.array([Y_validate, X_validate]).T, columns=['label', 'text'])
df_tst = pd.DataFrame(np.array([Y_test, X_test]).T, columns=['label', 'text'])

N_CLASSES = len(Y_train.value_counts())
print(N_CLASSES, "classes")

for size in [100, 300, 1000]:
    out_dir = subtask_dir / ("fair_balanced_"+str(size))
    # to_csv does not create missing folders, so make sure the target exists first.
    out_dir.mkdir(parents=True, exist_ok=True)
    # Train keeps `size` records per class; validate and test are identical in every folder.
    X_train_sub, Y_train_sub = balance_classes(X_train, Y_train, min_class_value=size)
    df_trn = pd.DataFrame(np.array([Y_train_sub, X_train_sub]).T, columns=['label', 'text'])
    df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

Total class distribution: {'Positive': 14302, 'Neutral': 16106, 'Negative': 5224}
Train class distribution: {'Positive': 2860, 'Neutral': 3221, 'Negative': 1045}
Validate class distribution: {'Positive': 2861, 'Neutral': 3221, 'Negative': 1045}
Test class distribution: {'Positive': 8581, 'Neutral': 9664, 'Negative': 3134}
Neutral     3221
Positive    2860
Negative    1045
Name: Sentiment, dtype: int64
Neutral     9664
Positive    8581
Negative    3134
Name: Sentiment, dtype: int64
Y class distribution: Neutral     1045
Negative    1045
Positive    1045
Name: Sentiment, dtype: int64
Y class distribution: Neutral     3000
Negative    3000
Positive    3000
Name: Sentiment, dtype: int64
3 classes
Y class distribution: Negative    100
Neutral     100
Positive    100
Name: Sentiment, dtype: int64
Y class distribution: Neutral     300
Negative    300
Positive    300
Name: Sentiment, dtype: int64
Y class distribution: Neutral     1000
Negative    1000
Positive    1000
Name: Sentiment, dtype: i

In [65]:
# Read the 1000-per-class balanced training file back from disk and show its label counts.
balanced_train_csv = subtask_dir / "fair_balanced_1000" / "train.csv"
df = pd.read_csv(balanced_train_csv)
df['label'].value_counts()

Neutral     1000
Negative    1000
Positive    1000
Name: label, dtype: int64

In [66]:
# Show ten randomly drawn texts whose label is 'Negative'.
is_negative = df['label'] == 'Negative'
df.loc[is_negative, 'text'].sample(n=10).tolist()

["Harper, you may not think Justin's ready, but urine over your head.",
 'July 27th 2015- the day all the women who find Dean Ambrose attractive died',
 '@Zwelinzima1 do we really have to march every time we have issues to deal with?   Marchers clash at Cosatu rally http://t.co/JN9zwWMp',
 'Call us superstitious...our condo does not have a 13th floor. Some Wiki thoughts on the origin of that tradition: http://t.co/BnGepbeC',
 "@Lrihendry @ChristieC733 what's scary is that it was only 5-4. Like our SCOTUS, Irving may flip on one vote.",
 'Once again Democrats spent all night and this morning trying to talk down the stock market...whatever happens there can be no blank check!',
 'stupid cable took the CW Network away why?it better be back by october 11th if not we are going to have a problem lol #tvd',
 "Embarrassing. Rams cut Alexander. Yes I know he's from Mizzou. But c'mon. Chiefs D makes unknowns look like pro bowlers.",
 'Steve may in fact be the biggest pussy in Big Brother history

In [3]:
#FIXUP
# Normalise the capitalisation of the labels in every exported split, rewriting each CSV in place.

folders = ["fair_balanced_100", "fair_balanced_300", "fair_balanced_1000", "fair_unbalanced_100", "fair_unbalanced_300", "fair_unbalanced_1000"]

label_fixes = {"negative": "Negative", "positive": "Positive", "neutral": "Neutral"}

for folder in folders:
    # Only the CSV exports are touched; any other entry in the folder is skipped
    # instead of being parsed and overwritten as CSV.
    files = sorted(name for name in os.listdir(subtask_dir / folder) if name.endswith('.csv'))
    for file in files:
        # keep_default_na=False: texts such as "NA" or "null" stay strings instead of being
        # read as missing values and written back as empty fields.
        df = pd.read_csv(subtask_dir / folder / file, keep_default_na=False)
        print(subtask_dir / folder / file)
        df["label"] = df["label"].replace(label_fixes)
        print(df['label'].value_counts())
        df.to_csv(subtask_dir / folder / file, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeval\Subtask_A\fair_balanced_100\test.csv
Negative    3000
Positive    3000
Neutral     3000
Name: label, dtype: int64
C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeval\Subtask_A\fair_balanced_100\train.csv
Negative    100
Neutral     100
Positive    100
Name: label, dtype: int64
C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeval\Subtask_A\fair_balanced_100\validate.csv
Negative    1046
Positive    1046
Neutral     1046
Name: label, dtype: int64
C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeval\Subtask_A\fair_balanced_300\test.csv
Negative    3000
Positive    3000
Neutral     3000
Name: label, dtype: int64
C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeval\Subtask_A\fair_balanced_300\train.csv
Positive    300
Negative    300
Neutral     300
Name: label, dtype: int64
C:\Users\usherwoodpe\Documents\bibliotecas\low_shot_tl\datasets\semeva

In [22]:
# Show ten randomly drawn training texts whose label is 'Negative'.
# NOTE(review): the recorded output of this cell contains NaN texts - presumably some rows lost their text upstream; verify.
is_negative = df_trn['label'] == 'Negative'
df_trn.loc[is_negative, 'text'].sample(n=10).tolist()

[nan,
 'Project X is the best film ever made and someone with money needs to have one, like this Friday. #makeithappen',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Just found out we have 91 Appointments n we are shooting for 120 on Monday - Huntsville, AL are you ready for us?!?!  #NMAE @lancomeparis',
 nan]