In [2]:
from typing import List

import pandas as pd

%load_ext autoreload
%autoreload 2

In [3]:
# export
def parse_file(fname: str, encoding: str = "utf-8") -> pd.DataFrame:
    """Parse a blank-line-delimited annotation file into a DataFrame.

    The file is read as a sequence of records, each terminated by an empty
    line. The raw text of every record is handed to ``parse_string`` and the
    dicts it returns become the rows of the resulting frame.

    Args:
        fname: Path of the text file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        A ``pandas.DataFrame`` with one row per record (empty if the file
        contains no records).
    """
    records = []
    pending = []  # raw lines of the record currently being accumulated

    # Iterate over the handle directly so the file is streamed line by line.
    with open(fname, "r", encoding=encoding) as f:
        for raw_line in f:
            pending.append(raw_line)
            if raw_line == "\n":
                # A separator with nothing before it (e.g. two blank lines in
                # a row) carries no record; parsing it would fail on the
                # missing header, so it is skipped.
                if len(pending) > 1:
                    records.append(parse_string("".join(pending)))
                pending = []

    # Flush the final record when the file does not end with a blank line;
    # without this the last record would be silently dropped.
    tail = "".join(pending)
    if tail.strip():
        records.append(parse_string(tail))

    return pd.DataFrame(records)

In [4]:
# export
def parse_string(line: str) -> dict:
    """Turn the raw text of one record into a ``uid``/``sentiment``/``text`` dict.

    The first non-empty row is the header and must hold exactly three
    tab-separated fields; the second and third are used as ``uid`` and
    ``sentiment``. From every following row only the first tab-separated
    field is kept, and those fields are joined with single spaces to form
    ``text``.

    Args:
        line: Newline-separated text of a single record. Trailing (or other)
            empty rows are ignored.

    Returns:
        A dict with the string values ``uid``, ``sentiment`` and ``text``.

    Raises:
        ValueError: If the record is empty or its header does not have
            exactly three tab-separated fields.
    """
    # A record normally ends with "\n\n", so a plain split leaves empty
    # strings at the end; keeping them would append stray spaces to ``text``.
    rows = [row for row in line.split("\n") if row]
    if not rows:
        raise ValueError("cannot parse an empty record")

    header_fields = rows[0].split("\t")
    if len(header_fields) != 3:
        raise ValueError(
            "expected 3 tab-separated header fields, got "
            f"{len(header_fields)}: {rows[0]!r}"
        )
    _meta, uid, sentiment = header_fields

    # Only the first column of each remaining row contributes to the text.
    text = " ".join(row.split("\t")[0] for row in rows[1:])

    return {"uid": uid, "sentiment": sentiment, "text": text}

# Valid Data

In [6]:
# Parse the dev file, then show how many records were read.
valid_df = parse_file("data/raw/dev_3k_split_conll.txt")
len(valid_df)

3000

In [7]:
# Preview the first rows of the parsed frame.
valid_df.head()

Unnamed: 0,sentiment,text,uid
0,positive,@ prahladspatel modi mantrimandal may samil ho...,30258
1,negative,@ bkunalraj @ TajinderBagga @ NikhilJakhar14 @...,16648
2,negative,@ waglenikhil U saw caste and religion in them...,28511
3,neutral,@ DelhiPolice sir local police station pe comp...,10466
4,positive,Ve Maahi song from # Kesari is current favouri...,19266


In [9]:
# Write the frame to data/interim/valid.json using the "records" orientation.
valid_df.to_json("data/interim/valid.json", orient="records")

In [11]:
# Per-column summary (count / unique / top / freq) of the parsed frame.
valid_df.describe()

Unnamed: 0,sentiment,text,uid
count,3000,3000,3000
unique,3,3000,3000
top,neutral,@ idreesAlzeyadi @ MaryamNSharif AUR HAAN PAK ...,29819
freq,1128,1,1


# Trial Data

In [4]:
# Parse the trial file, then show how many records were read.
trial_df = parse_file("trial.txt")
len(trial_df)

1869

In [5]:
# Preview the first rows of the parsed trial frame.
trial_df.head()

Unnamed: 0,uid,sentiment,text
0,8,neutral,RT @ UAAPconfessions Love looks good on Maddie...
1,12,neutral,Ye Ye ..... ye ??????? We gonna start another ...
2,14,neutral,@ zWffFY9JGklElA1 @ Min _ Of _ Lyching @ thaku...
3,23,negative,~ Caring . ~ Bohot Jyada Caring . ~ Courier wa...
4,24,positive,@ AliHZaidiPTI @ SarfarazA _ 54 What nonesense...


In [6]:
# Write the trial frame to trial.json using the "records" orientation.
trial_df.to_json("trial.json", orient="records")

## Train
### Previous was Trial Text

In [7]:
# Parse the train file, then show how many records were read.
train_df = parse_file("train.txt")
len(train_df)

15131

In [8]:
# Preview the first rows of the parsed train frame.
train_df.head()

Unnamed: 0,uid,sentiment,text
0,3,negative,@ AdilNisarButt pakistan ka ghra tauq he Pakis...
1,41,negative,Madarchod mulle ye mathura me Nahi dikha tha j...
2,48,positive,@ narendramodi Manya Pradhan Mantri mahoday Sh...
3,64,positive,@ Atheist _ Krishna Jcb full trend me chal rah...
4,66,positive,@ AbhisharSharma _ @ RavishKumarBlog Loksabha ...


In [9]:
# Per-column summary (count / unique / top / freq) of the parsed train frame.
train_df.describe()

Unnamed: 0,uid,sentiment,text
count,15131,15131,15131
unique,15131,3,15131
top,20202,neutral,@ pahaadkhan @ SureshChavhanke Musalman deshbh...
freq,1,5638,1


In [10]:
# Write the train frame to train.json using the "records" orientation.
train_df.to_json("train.json", orient="records")

# Create Train Large

In [2]:
import pandas as pd

In [3]:
# Reload both JSON exports (trial first, then train) and preview each frame.
trial, train = (pd.read_json(path) for path in ("trial.json", "train.json"))
trial.head(), train.head()

(   uid sentiment                                               text
 0    8   neutral  RT @ UAAPconfessions Love looks good on Maddie...
 1   12   neutral  Ye Ye ..... ye ??????? We gonna start another ...
 2   14   neutral  @ zWffFY9JGklElA1 @ Min _ Of _ Lyching @ thaku...
 3   23  negative  ~ Caring . ~ Bohot Jyada Caring . ~ Courier wa...
 4   24  positive  @ AliHZaidiPTI @ SarfarazA _ 54 What nonesense...,
    uid sentiment                                               text
 0    3  negative  @ AdilNisarButt pakistan ka ghra tauq he Pakis...
 1   41  negative  Madarchod mulle ye mathura me Nahi dikha tha j...
 2   48  positive  @ narendramodi Manya Pradhan Mantri mahoday Sh...
 3   64  positive  @ Atheist _ Krishna Jcb full trend me chal rah...
 4   66  positive  @ AbhisharSharma _ @ RavishKumarBlog Loksabha ...)

In [4]:
# Stack the trial rows on top of the train rows. ignore_index=True renumbers
# the result 0..n-1; without it each input keeps its own 0-based index, so
# labels repeat and label-based lookups on the combined frame are ambiguous.
df = pd.concat([trial, train], ignore_index=True)
len(df)

17000

In [5]:
# Write the combined frame to train-large.json using the "records" orientation.
df.to_json("train-large.json", orient="records")