In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
def read_texts(path, target):
    """
    Read every file under ``path`` and label it with a known sentiment.

    The directory tree rooted at ``path`` is walked recursively with
    ``os.walk``; each file is opened as UTF-8 text and only its first line
    is kept as the review.

    Parameters
    ----------
    path : str
        Root directory to walk.
    target : str
        Sentiment label stored in the ``sentiment`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``review`` and ``sentiment`` and a
        fresh 0..n-1 index. Empty (but with both columns) when no files
        are found.
    """
    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    records = []

    for dirname, _subdirs, filenames in os.walk(path):
        # One progress bar per directory that is visited.
        for filename in tqdm(filenames, total=len(filenames)):
            filepath = os.path.join(dirname, filename)

            with open(filepath, 'r', encoding="utf8") as f:
                # readline() returns '' for an empty file, where indexing
                # readlines()[0] would raise IndexError.
                first_line = f.readline()

            records.append({'review': first_line, 'sentiment': target})

    return pd.DataFrame(records, columns=['review', 'sentiment'])

In [3]:
# Labelled training reviews: one frame per sentiment class.
df_trainpos, df_trainneg = (
    read_texts(path=folder, target=label)
    for folder, label in (
        ('data/train/pos/', 'positive'),
        ('data/train/neg/', 'negative'),
    )
)

100%|██████████| 12500/12500 [00:41<00:00, 303.95it/s]
100%|██████████| 12500/12500 [00:41<00:00, 302.49it/s]


In [6]:
# Sanity check: (rows, columns) of the positive and negative training frames.
print(*(frame.shape for frame in (df_trainpos, df_trainneg)))

(12500, 2) (12500, 2)


In [8]:
# Labelled test reviews: one frame per sentiment class, then a shape check.
df_testpos, df_testneg = (
    read_texts(path=folder, target=label)
    for folder, label in (
        ('data/test/pos/', 'positive'),
        ('data/test/neg/', 'negative'),
    )
)
print(*(frame.shape for frame in (df_testpos, df_testneg)))

100%|██████████| 12500/12500 [00:34<00:00, 365.81it/s]
100%|██████████| 12500/12500 [00:37<00:00, 332.97it/s]

(12500, 2) (12500, 2)





In [21]:
# Stack the positive and negative frames of each split into a single frame.
# The original row labels are kept here, so they repeat across the two halves.
trainframes, testframes = [df_trainpos, df_trainneg], [df_testpos, df_testneg]
df_train, df_test = (pd.concat(frames) for frames in (trainframes, testframes))

In [22]:
# Shuffle every row of both splits and renumber the index from 0.
# A fixed seed keeps the row order (and the preview below) reproducible
# on a fresh kernel; without it every run produced a different order.
SHUFFLE_SEED = 42
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test.head()

Unnamed: 0,review,sentiment
0,Just finished this impressively nutty affair a...,positive
1,"The Big Knife, a movie about the dark side of ...",negative
2,"I saw this film in a sneak preview, and it is ...",positive
3,I bought this movie from a market stall three ...,negative
4,The film moves along quite well but the acting...,negative


In [23]:
# Testing samples: reviews under data/train/unsup/, all tagged 'unsup'
df_unsup = read_texts('data/train/unsup/', 'unsup')
df_unsup.head()

100%|██████████| 50000/50000 [04:46<00:00, 174.75it/s]


Unnamed: 0,review,sentiment
0,"I admit, the great majority of films released ...",unsup
1,"Take a low budget, inexperienced actors doubli...",unsup
2,"Everybody has seen 'Back To The Future,' right...",unsup
3,Doris Day was an icon of beauty in singing and...,unsup
4,"After a series of silly, fun-loving movies, 19...",unsup


In [30]:
# Persist the 'unsup' frame. Note the file is tab-separated despite the
# .csv extension, and the row index is written as the first column.
df_unsup.to_csv(path_or_buf="data/testset.csv", sep="\t")