In [18]:
import pandas as pd
import numpy as np
import math

# Location of the raw reviews CSV, relative to the notebook's working directory.
DATASET_PATH = './IMBD.csv'

In [19]:
# Load the whole dataset into a DataFrame.
df = pd.read_csv(DATASET_PATH)

In [20]:
# Display the loaded frame (the rendered repr is truncated to the first and last rows).
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [21]:
def convert_binary(sentiment):
    """Encode a sentiment label as a binary integer.

    Args:
        sentiment: A raw label (``"positive"`` / ``"negative"``) or a label
            that has already been encoded as ``1`` / ``0``.

    Returns:
        ``1`` for ``"positive"`` (or an existing ``1``); ``0`` for every
        other value.
    """
    # Accepting an existing 1 keeps the encoding idempotent: mapping an
    # already-converted column a second time (e.g. re-running a cell) would
    # otherwise compare the int 1 against "positive" and turn every label
    # into 0.
    if sentiment == "positive" or sentiment == 1:
        return 1
    return 0

In [22]:
# Count the reviews per sentiment label to check the class balance.
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [23]:
# Fraction of the rows intended for the test split.
test_ratio = 0.20
# Number of chunks implied by that fraction. Because of the rounding up, a
# ratio whose reciprocal is not an integer gives chunks of 1 / n_chunks of the
# data rather than exactly test_ratio.
n_chunks = math.ceil(1 / test_ratio)

In [24]:
# Split row *positions* rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes, whereas splitting
# an integer range yields the same near-equal consecutive chunks.
# Selecting with iloc and an integer array also makes each chunk an
# independent frame, so later in-place edits cannot touch `df`.
chunks = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), n_chunks)]
# The first chunk is held out as the test split; the rest become training data.
# NOTE: rows are taken in file order, without shuffling.
test_df = chunks[0]
train_chunks = chunks[1:]

test_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [25]:
# Stack the remaining chunks back into a single training frame.
train_df = pd.concat(train_chunks)
train_df

Unnamed: 0,review,sentiment
10000,""" While sporadically engrossing (including a f...",negative
10001,"A French novelist, disgusted by his wife's soc...",positive
10002,"It must be remembered that the Gammera movies,...",negative
10003,"I gave this film 8 out of 10, reserving 10 for...",positive
10004,"Cheap, gloriously bad cheese from the 80's, th...",negative
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [26]:
# sep="\n" puts each header on its own line without print's default space
# separator, which would otherwise indent the first row of the counts and
# misalign it with the rows below.
print("TRAIN", train_df['sentiment'].value_counts(), sep="\n")
print("\nTEST", test_df['sentiment'].value_counts(), sep="\n")

TRAIN
 negative    20028
positive    19972
Name: sentiment, dtype: int64

TEST
 positive    5028
negative    4972
Name: sentiment, dtype: int64


In [27]:
# Unlabelled version of the test split: same rows, 'sentiment' column removed.
test_df_no_sentiment = test_df.drop(columns=['sentiment'])
test_df_no_sentiment

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...
9995,"Fun, entertaining movie about WWII German spy ..."
9996,Give me a break. How can anyone say that this ...
9997,This movie is a bad movie. But after watching ...
9998,This is a movie that was probably made to ente...


In [28]:
def replace_sentiment_labels(df: pd.DataFrame) -> None:
    """Overwrite ``df['sentiment']`` with its ``convert_binary`` encoding.

    The frame is modified in place and nothing is returned.

    Args:
        df: Frame containing a ``'sentiment'`` column.
    """
    encoded = df['sentiment'].map(convert_binary)
    df['sentiment'] = encoded

In [29]:
# Encode the 'sentiment' column of both splits in place, then show the test split.
for split in (train_df, test_df):
    replace_sentiment_labels(split)
test_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",1
9996,Give me a break. How can anyone say that this ...,0
9997,This movie is a bad movie. But after watching ...,0
9998,This is a movie that was probably made to ente...,0


In [30]:
# Write the three dataset files: the training split, the labelled test split,
# and the test split without its 'sentiment' column.
outputs = (
    ('./train.csv', train_df),
    ('./test_solutions.csv', test_df),
    ('./test.csv', test_df_no_sentiment),
)
for csv_path, frame in outputs:
    frame.to_csv(csv_path)

In [31]:
# Placeholder submission: predict the positive class (1) for every test row.
predictions = [1] * len(test_df.index)
# Reuse test_df's own index so the submission's row ids always match the test
# rows, instead of relying on a fresh 0..n-1 range happening to coincide with
# them.
sample_df = pd.DataFrame({'sentiment': predictions}, index=test_df.index)
sample_df

Unnamed: 0,sentiment
0,1
1,1
2,1
3,1
4,1
...,...
9995,1
9996,1
9997,1
9998,1


In [32]:
# Save the placeholder predictions as an example submission file.
sample_df.to_csv('sample_submission.csv')