In [1]:
from typing import List

import pandas as pd

%load_ext autoreload
%autoreload 2

In [2]:
# export
def parse_file(fname: str) -> pd.DataFrame:
    """Parse a labelled, blank-line-separated token file into a DataFrame.

    The file is read line by line. Lines are accumulated until a blank line
    (exactly ``"\\n"``) is seen; the accumulated text is then handed to
    ``parse_string`` and the returned dict becomes one row of the result.

    Args:
        fname: Path of the file to read. Anything ``open`` accepts works
            (the notebook passes ``pathlib.Path`` objects).

    Returns:
        A ``pd.DataFrame`` with one row per record, built from the dicts
        returned by ``parse_string``.
    """
    # Explicit encoding: open()'s default is platform-dependent and the data
    # contains non-ASCII text. Assumes the files are UTF-8 -- TODO confirm.
    with open(fname, "r", encoding="utf-8") as f:
        contents = f.readlines()

    records = []
    buffer = []
    for c in contents:
        buffer.append(c)
        if c == "\n":
            record = "".join(buffer)
            buffer = []
            # Skip empty records (e.g. two blank lines in a row) instead of
            # letting parse_string fail on a missing header line.
            if record.strip():
                records.append(parse_string(record))

    # Flush the last record when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    trailing = "".join(buffer)
    if trailing.strip():
        if not trailing.endswith("\n"):
            trailing += "\n"
        # Append the blank separator line so this record has the same shape
        # as every other record passed to parse_string.
        records.append(parse_string(trailing + "\n"))

    return pd.DataFrame(records)

In [4]:
# export
def parse_string(line: str) -> dict:
    """Turn one labelled record (header line plus token lines) into a dict.

    The first line of ``line`` must hold three tab-separated fields; the
    second is used as the uid and the third as the sentiment (the first is
    ignored). On every following non-blank line, the text before the first
    tab is taken as a word, and the words are joined with single spaces.

    Args:
        line: The raw text of one record, lines separated by ``"\\n"``.

    Returns:
        ``{"uid": ..., "sentiment": ..., "text": ...}`` with string values.

    Raises:
        ValueError: If the header line does not have exactly three
            tab-separated fields.
    """
    header, *token_lines = line.split("\n")
    _meta, uid, sentiment = header.split("\t")

    # Ignore blank lines: splitting on "\n" leaves empty strings for the
    # record's terminating newlines, which previously ended up as trailing
    # spaces in the joined text.
    words = [
        token_line.split("\t")[0]
        for token_line in token_lines
        if token_line.strip()
    ]

    return {"uid": uid, "sentiment": sentiment, "text": " ".join(words)}

In [None]:
from pathlib import Path

# Root of the data tree, resolved relative to the current working directory.
datapath = Path("../data")

# Sub-directories of the data root referenced by the cells below.
data_raw = datapath.joinpath("raw")
data_interim = datapath.joinpath("interim")
data_processed = datapath.joinpath("processed")
cleanlab_datapath = datapath.joinpath("cleanlab")

# Validation Data

In [5]:
valid_df = parse_file(data_raw/"dev_3k_split_conll.txt")
len(valid_df)

3000

In [6]:
valid_df.head()

Unnamed: 0,uid,sentiment,text
0,30258,positive,@ prahladspatel modi mantrimandal may samil ho...
1,16648,negative,@ bkunalraj @ TajinderBagga @ NikhilJakhar14 @...
2,28511,negative,@ waglenikhil U saw caste and religion in them...
3,10466,neutral,@ DelhiPolice sir local police station pe comp...
4,19266,positive,Ve Maahi song from # Kesari is current favouri...


In [8]:
valid_df.to_json(data_interim/"valid.json", orient="records")

In [9]:
valid_df.describe()

Unnamed: 0,uid,sentiment,text
count,3000,3000,3000
unique,3000,3,3000
top,22131,neutral,@ RanveerOfficial @ XiaomiIndia @ RedmiIndia D...
freq,1,1128,1


# Trial Data

In [10]:
# Parse the trial split; the bare expression on the last line shows the row count.
trial_df = parse_file(data_raw / "trial.txt")
len(trial_df)

1869

In [11]:
trial_df.head()

Unnamed: 0,uid,sentiment,text
0,8,neutral,RT @ UAAPconfessions Love looks good on Maddie...
1,12,neutral,Ye Ye ..... ye ??????? We gonna start another ...
2,14,neutral,@ zWffFY9JGklElA1 @ Min _ Of _ Lyching @ thaku...
3,23,negative,~ Caring . ~ Bohot Jyada Caring . ~ Courier wa...
4,24,positive,@ AliHZaidiPTI @ SarfarazA _ 54 What nonesense...


In [12]:
# Persist the parsed trial split as a JSON list of records.
# NOTE(review): this writes under data_raw, whereas valid.json is written to
# data_interim; the "Create Train Large" cell reads this file back from data_raw.
trial_df.to_json(data_raw/"trial.json", orient="records")

## Train
### The previous section parsed the trial text; this section parses the training text

In [13]:
# Parse the train split; the bare expression on the last line shows the row count.
train_df = parse_file(data_raw / "train.txt")
len(train_df)

15131

In [14]:
train_df.head()

Unnamed: 0,uid,sentiment,text
0,3,negative,@ AdilNisarButt pakistan ka ghra tauq he Pakis...
1,41,negative,Madarchod mulle ye mathura me Nahi dikha tha j...
2,48,positive,@ narendramodi Manya Pradhan Mantri mahoday Sh...
3,64,positive,@ Atheist _ Krishna Jcb full trend me chal rah...
4,66,positive,@ AbhisharSharma _ @ RavishKumarBlog Loksabha ...


In [15]:
train_df.describe()

Unnamed: 0,uid,sentiment,text
count,15131,15131,15131
unique,15131,3,15131
top,17251,neutral,@ AshrafFem Kisi ki bhi hatya hona bahut galat...
freq,1,5638,1


In [16]:
# Persist the parsed train split as a JSON list of records.
# NOTE(review): this writes under data_raw, whereas valid.json is written to
# data_interim; the "Create Train Large" cell reads this file back from data_raw.
train_df.to_json(data_raw/"train.json", orient="records")

# Create Train Large

In [17]:
import pandas as pd

In [20]:
# Reload the trial and train splits from the JSON files written earlier in the notebook.
# NOTE(review): pd.read_json infers dtypes by default, so `uid` presumably comes
# back numeric here even though parse_file produced strings -- confirm this is intended.
trial = pd.read_json(data_raw/"trial.json")
train = pd.read_json(data_raw/"train.json")
# The cell's value is a tuple of two previews, so it renders as plain text
# rather than as two HTML tables.
trial.head(), train.head()

(   uid sentiment                                               text
 0    8   neutral  RT @ UAAPconfessions Love looks good on Maddie...
 1   12   neutral  Ye Ye ..... ye ??????? We gonna start another ...
 2   14   neutral  @ zWffFY9JGklElA1 @ Min _ Of _ Lyching @ thaku...
 3   23  negative  ~ Caring . ~ Bohot Jyada Caring . ~ Courier wa...
 4   24  positive  @ AliHZaidiPTI @ SarfarazA _ 54 What nonesense...,
    uid sentiment                                               text
 0    3  negative  @ AdilNisarButt pakistan ka ghra tauq he Pakis...
 1   41  negative  Madarchod mulle ye mathura me Nahi dikha tha j...
 2   48  positive  @ narendramodi Manya Pradhan Mantri mahoday Sh...
 3   64  positive  @ Atheist _ Krishna Jcb full trend me chal rah...
 4   66  positive  @ AbhisharSharma _ @ RavishKumarBlog Loksabha ...)

In [21]:
# Stack the trial and train splits into one larger training set.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each input keeps its own 0-based labels and the result has duplicate index
# values, which makes label-based lookups on `df` ambiguous.
df = pd.concat([trial, train], ignore_index=True)
len(df)

17000

In [22]:
df.to_json(data_interim/"train-large.json", orient="records")

# Test Data

In [23]:
# export
def parse_file_test(fname: str) -> pd.DataFrame:
    """Parse an unlabelled, blank-line-separated token file into a DataFrame.

    The file is read line by line. Lines are accumulated until a blank line
    (exactly ``"\\n"``) is seen; the accumulated text is then handed to
    ``parse_string_test`` and the returned dict becomes one row of the result.

    Args:
        fname: Path of the file to read. Anything ``open`` accepts works
            (the notebook passes ``pathlib.Path`` objects).

    Returns:
        A ``pd.DataFrame`` with one row per record, built from the dicts
        returned by ``parse_string_test``.
    """
    # Explicit encoding: open()'s default is platform-dependent and the data
    # contains non-ASCII text. Assumes the files are UTF-8 -- TODO confirm.
    with open(fname, "r", encoding="utf-8") as f:
        contents = f.readlines()

    records = []
    buffer = []
    for c in contents:
        buffer.append(c)
        if c == "\n":
            record = "".join(buffer)
            buffer = []
            # Skip empty records (e.g. two blank lines in a row) instead of
            # letting parse_string_test fail on a missing header line.
            if record.strip():
                records.append(parse_string_test(record))

    # Flush the last record when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    trailing = "".join(buffer)
    if trailing.strip():
        if not trailing.endswith("\n"):
            trailing += "\n"
        # Append the blank separator line so this record has the same shape
        # as every other record passed to parse_string_test.
        records.append(parse_string_test(trailing + "\n"))

    return pd.DataFrame(records)

In [24]:
# export
def parse_string_test(line: str) -> dict:
    """Turn one unlabelled record (header line plus token lines) into a dict.

    The first line of ``line`` must hold two tab-separated fields; the second
    is used as the uid (the first is ignored). On every following non-blank
    line, the text before the first tab is taken as a word, and the words are
    joined with single spaces.

    Args:
        line: The raw text of one record, lines separated by ``"\\n"``.

    Returns:
        ``{"uid": ..., "text": ...}`` with string values.

    Raises:
        ValueError: If the header line does not have exactly two
            tab-separated fields.
    """
    header, *token_lines = line.split("\n")
    _meta, uid = header.split("\t")

    # Ignore blank lines: splitting on "\n" leaves empty strings for the
    # record's terminating newlines, which previously ended up as trailing
    # spaces in the joined text.
    words = [
        token_line.split("\t")[0]
        for token_line in token_lines
        if token_line.strip()
    ]

    return {"uid": uid, "text": " ".join(words)}

In [25]:
test_df = parse_file_test(data_raw/"test.txt")
len(test_df)

2999

In [26]:
test_df.head()

Unnamed: 0,uid,text
0,20803,@ 454dkhan @ Heisunberg _ Agr kse ko itni impo...
1,20187,logon ko alloo pyaz tomator me toh allah pak k...
2,20953,@ LambaAlka Wafadaar bane rahane ka nayab tari...
3,13777,@ varnishant @ narendramodi Chup bhosdike . He...
4,20990,RT @ HardeepSPuri à¨¦à¨¾à¨¤à¨¾ à¨¸à©‹ à¨¸à¨¾à¨...


In [27]:
test_df.to_json(data_interim/"final_test.json", orient="records")