In [3]:
import os
import requests

# (local file name, source URL) for every raw dataset this notebook needs.
_dataset_sources = (
    ("train.tsv", "https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv"),
    ("test.tsv", "https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv"),
    ("imdb.csv", "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv"),
)
datasets = dict(_dataset_sources)

# Downloads are stored under ./data; create it up front (no-op when it exists).
os.makedirs("data", exist_ok=True)

def download_file(filename, url, timeout=30):
    """Download ``url`` to ``data/<filename>`` unless that file already exists.

    The response body is stored byte-for-byte (no charset guessing and no
    newline translation), and it only appears under its final name once it
    has been written completely.

    Args:
        filename: Name of the file to create inside the ``data`` directory.
        url: Address to fetch with ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` for an error
            status code.
        OSError: If the file cannot be written or renamed.
    """
    filepath = os.path.join("data", filename)
    if os.path.exists(filepath):
        print(f"{filename} already exists.")
        return

    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Do not rely on another cell having created the target directory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # Write to a scratch file first and rename it afterwards: a failure
    # half-way through must not leave a truncated file that the existence
    # check above would later accept as a finished download.
    partial_path = filepath + ".part"
    try:
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, filepath)
    finally:
        # After a successful rename the scratch file is gone; this only
        # cleans up when something above failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"{filename} downloaded.")

# Fetch every configured dataset, one download_file call per entry.
for filename in datasets:
    url = datasets[filename]
    download_file(filename, url)

train.tsv already exists.
test.tsv already exists.
imdb.csv already exists.


In [4]:
# Preview every regular file in data/: its first 150 characters and its total
# length in characters.
# os.listdir returns entries in arbitrary order, so sort for a stable report.
for file in sorted(os.listdir('data')):
    path = os.path.join('data', file)
    # Skip sub-directories; open() would fail on them.
    if not os.path.isfile(path):
        continue
    # Decode explicitly as UTF-8 so the result does not depend on the platform
    # default; errors='replace' keeps a stray non-text file from aborting the
    # whole preview.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        # Read once and reuse the text for both the preview and the length.
        content = f.read()
    print(f'First few characters of {file}: {content[:150]}')
    print(f'Length of {file}: {len(content)}')
    print('\n')

First few characters of imdb.csv: review,sentiment
"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly 
Length of imdb.csv: 66200352


First few characters of imdb_split.csv: text,label
"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what h
Length of imdb_split.csv: 65850346


First few characters of test.tsv: no movement , no yuks , not much of anything	0
a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch i
Length of test.tsv: 188749


First few characters of test_imdb_split.csv: text,label
"I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason.
Length of test_imdb_split.csv: 13211160


First few characters of train.tsv: a stirring , funny and finally transporting re imagining of beauty an

In [5]:
import pandas as pd

# Load the SST2 training file: tab-separated with no header row, so the column
# names are supplied here and the label column is cast to int.
train_data_df = pd.read_csv(
    'data/train.tsv',
    sep='\t',
    header=None,
    names=['text', 'label'],
).astype({'label': int})
# Label encoding: 0 is negative, 1 is positive
display(train_data_df.head())

Unnamed: 0,text,label
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6920 non-null   object
 1   label   6920 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 81.2+ KB
None


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the loaded training data for validation; the fixed seed
# keeps the split the same on every run.
train_df, val_df = train_test_split(
    train_data_df,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (train_df, val_df)))

(5536, 2) (1384, 2)


In [8]:
# Load the test file the same way as the training file: tab-separated, no
# header row, explicit column names, integer labels.
test_df = pd.read_csv(
    'data/test.tsv',
    sep='\t',
    header=None,
    names=['text', 'label'],
).astype({'label': int})
display(test_df.head())
print(test_df.shape)

Unnamed: 0,text,label
0,"no movement , no yuks , not much of anything",0
1,"a gob of drivel so sickly sweet , even the eag...",0
2,"gangs of new york is an unapologetic mess , wh...",0
3,"we never really feel involved with the story ,...",0
4,this is one of polanski 's best films,1


(1821, 2)


In [9]:
# Write the train and validation splits as tab-separated files (with a header
# row, without the index column).
for split_df, split_path in (
    (train_df, "data/train_split.tsv"),
    (val_df, "data/val_split.tsv"),
):
    split_df.to_csv(split_path, sep="\t", index=False)

print("train and validation splits saved")

train and validation splits saved


In [10]:
# Class balance of each split, in the order train, validation, test.
for split_df in (train_df, val_df, test_df):
    print('\n', split_df['label'].value_counts())


 label
1    2897
0    2639
Name: count, dtype: int64

 label
1    713
0    671
Name: count, dtype: int64

 label
0    912
1    909
Name: count, dtype: int64


In [11]:
import pandas as pd

# Load the IMDB reviews and rename the columns to the text/label names used
# for the other dataset in this notebook.
imdb_df = pd.read_csv('data/imdb.csv').rename(
    columns={'review': 'text', 'sentiment': 'label'}
)
display(imdb_df.head())

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
# Encode the sentiment strings as integers: 1 = positive, 0 = negative.
# 0 and 1 map to themselves so that re-running this cell on an already encoded
# frame is a no-op instead of silently turning every label into 0.
label_encoding = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_label = imdb_df['label'].map(label_encoding)
# Values outside the mapping (typos, missing values, ...) come back as NaN;
# fail loudly rather than mislabel them as negative.
if encoded_label.isna().any():
    unexpected = imdb_df.loc[encoded_label.isna(), 'label'].unique()
    raise ValueError(f"Unexpected sentiment labels: {unexpected!r}")
imdb_df['label'] = encoded_label.astype('int64')
display(imdb_df.head())

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [13]:
from sklearn.model_selection import train_test_split
 
# Split the IMDB data into train, validation and test sets and save each one.
# Step 1: hold out 20% of all rows as the test set.
train_val_imdb_df, test_imdb_df = train_test_split(
    imdb_df, test_size=0.2, random_state=42
)
# Step 2: take 20% of the remaining rows as the validation set.
train_imdb_df, val_imdb_df = train_test_split(
    train_val_imdb_df, test_size=0.2, random_state=42
)

print(train_imdb_df.shape, val_imdb_df.shape, test_imdb_df.shape)

# Persist the three splits as CSV files without the index column.
for split_df, split_path in (
    (train_imdb_df, "data/train_imdb_split.csv"),
    (val_imdb_df, "data/val_imdb_split.csv"),
    (test_imdb_df, "data/test_imdb_split.csv"),
):
    split_df.to_csv(split_path, index=False)

(32000, 2) (8000, 2) (10000, 2)


In [16]:
# Class balance of each IMDB split, in the order train, validation, test.
for split_df in (train_imdb_df, val_imdb_df, test_imdb_df):
    print('\n', split_df['label'].value_counts())


 label
0    16080
1    15920
Name: count, dtype: int64

 label
1    4041
0    3959
Name: count, dtype: int64

 label
1    5039
0    4961
Name: count, dtype: int64


In [14]:
# Show the first rows of the IMDB training split. The index values in the
# displayed output are non-sequential, i.e. the split is not re-indexed.
display(train_imdb_df.head())

Unnamed: 0,text,label
11794,With no fault to the actors (they all put on g...,0
24925,The first thing I thought when I saw this film...,1
28578,Post-feminist depiction of cruelty and sadism....,1
13987,OMG this is one of the worst films iv ever see...,0
7693,"The Box is a film with great potential, but th...",0


In [15]:
# Writes the whole imdb_df frame (not one of the train/val/test splits) to
# CSV, without the index column.
imdb_full_csv = "data/imdb_split.csv"
imdb_df.to_csv(imdb_full_csv, index=False)
print("imdb split saved")

imdb split saved
