In [7]:
import ast
import gzip
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [8]:

def parse(path):
    """Yield one Python object per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (the visible
    callers treat the yielded objects as dict records).

    Args:
        path: Path to the gzip-compressed file.

    Yields:
        The object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # `with` guarantees the handle is closed even if the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rb') as handle:
        for raw_line in handle:
            text = raw_line.decode('utf-8').strip()
            if not text:
                # Blank lines carry no record; skip them instead of failing.
                continue
            # literal_eval only accepts literals (dicts, lists, strings,
            # numbers, ...), so a crafted line in the data file cannot execute
            # code the way the previous bare `eval` allowed.
            yield ast.literal_eval(text)


def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded and that
    number becomes the row label; each record supplies the values of one row.

    Args:
        path: File path handed straight to ``parse``.

    Returns:
        A ``pd.DataFrame`` with one row per parsed record.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [10]:

# Build one question/answer/description TSV per category, then combine them
# into a single (target_text, input_text, prefix) dataset and split it into
# train / eval sets.
RAW_DIR = "raw_data"
DATA_DIR = "data"
SEED = 42  # fixed so the train/eval split is the same on every run
# Files this notebook itself writes into DATA_DIR; they must not be read back
# as per-category inputs when the cell is re-run.
DERIVED_FILES = {"data_all.tsv", "train_df.tsv", "eval_df.tsv"}

os.makedirs(DATA_DIR, exist_ok=True)

# Each "qa_<suffix>" archive is paired with a "meta_<suffix>" archive below.
# The prefix test is "qa_" (not just "qa") so it matches the three characters
# that are sliced off. Sorted so the processing order does not depend on
# os.listdir's arbitrary ordering.
categories = sorted(
    name[len("qa_"):]
    for name in os.listdir(RAW_DIR)
    if name.endswith(".gz") and name.startswith("qa_")
)

for category in tqdm(categories):
    out_path = f"{DATA_DIR}/{category.split('.')[0]}.tsv"
    # Check the location the file is actually written to; the previous check
    # looked in raw_data/ and therefore never skipped any work.
    if os.path.isfile(out_path):
        continue
    try:
        qa_records = getDF(f"{RAW_DIR}/qa_{category}")
        meta_records = getDF(f"{RAW_DIR}/meta_{category}")

        category_df = (
            pd.merge(qa_records, meta_records, on="asin", how="left")
            [["question", "answer", "description"]]
            .dropna()
            .drop_duplicates(subset="answer")
        )
        print(category_df.head())

        category_df.to_csv(out_path, sep="\t")
    except Exception as exc:
        # Still best-effort (one bad category must not stop the others), but
        # report the failure instead of swallowing it silently.
        print(f"Skipping {category}: {type(exc).__name__}: {exc}")

category_files = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if name.endswith(".tsv") and name not in DERIVED_FILES
)

# NOTE(review): the [2:-2] slice appears to strip the "['" / "']" wrapper left
# when the list-valued description is written to and re-read from TSV; for
# multi-element lists the inner "', '" separators would remain - confirm.
df = (
    pd.concat(pd.read_csv(f"{DATA_DIR}/{name}", sep="\t") for name in category_files)
    [["question", "description"]]
    .assign(description=lambda frame: frame["description"].str[2:-2])
    .rename(columns={"question": "target_text", "description": "input_text"})
    .assign(prefix="ask_question")
)

df.to_csv(f"{DATA_DIR}/data_all.tsv", sep="\t")

train_df, eval_df = train_test_split(df, test_size=0.05, random_state=SEED)



  0%|          | 0/3 [00:00<?, ?it/s]

                                             question  \
46                                  filter for vicks3   
48  I have not purchased the humidifier that uses ...   
50  I need to replace a Holmes wick HWF-65T. Does ...   
52  will this be replacement for holmes model hm 1...   
54  I would like to buy at least 5 filters. Could ...   

                                               answer  \
46  Filter for vicks3 is NOT a question. What are ...   
48  The outer shell will fall apart in about a yea...   
50  The only humidifiers that this will fit are Su...   
52  Its generic and does not work in a Holmes humi...   
54  Hi: yes, we would bundle all 5 filters and cha...   

                                          description  
46  [Keep your air humidifier operating at peak ef...  
48  [Keep your air humidifier operating at peak ef...  
50  [Keep your air humidifier operating at peak ef...  
52  [Keep your air humidifier operating at peak ef...  
54  [Keep your air humidifier oper

In [12]:
# Replace each split's index with a fresh 0..n-1 range, discarding the old
# labels (drop=True) and modifying the frames in place.
for _split in (train_df, eval_df):
    _split.reset_index(drop=True, inplace=True)

In [13]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,target_text,input_text,prefix
0,will lint screen filter 339392 fit lint screen...,This is an O.E.M authorized part. Fits various...,ask_question
1,Can you cut Laminated Paper with this Machine?,The Cricut Expression Personal Electronic Cutt...,ask_question
2,Do these bobbins fit the 750 Golden Touch and sew,FITS: Singer 1000G Singer 1040G Singer 1060 Si...,ask_question
3,Does 100PCS mean 100 bails for 9.99 ?,"Bail approx 21mm in total,heart 10mm,Bright si...",ask_question
4,Does anyone know of a comparable quality pen w...,Kuretake No. 8 Fountain Brush Pen,ask_question


In [14]:
# Persist both splits as tab-separated files. `sep` is passed by keyword:
# passing it positionally is deprecated in recent pandas releases and is
# slated to become keyword-only.
train_df.to_csv("data/train_df.tsv", sep="\t")
eval_df.to_csv("data/eval_df.tsv", sep="\t")