# Training - Cornell Dataset

## Imports

In [26]:
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import os, shutil
import nltk
import sys
import re

## Converting the Cornell Dataset into DataFrames

The codes in this subsection only needs to be run once, as the data has already been combined into 1 csv file and is saved. Hence, the next time round, this section can just be skipped and the .csv file can just be loaded in directly.

In [27]:
def create_df(tag):
    DIR = os.path.join("review_polarity/txt_sentoken/", tag)
    contents = os.listdir(DIR)

    text = []

    if ".DS_Store" in contents:
        contents.remove(".DS_Store")

    for file in contents:
        file_path = os.path.join(DIR, file)
        with open(file_path) as f:
            lines = f.read()
        lines = lines.replace("\n", " ")
        text.append(lines)
    
    num_texts = len(contents)
    label = [tag] * num_texts

    if len(text) == len(label):
        print("INFO: Correct Length of Text and Label!")
    else:
        print(f"ERROR: Mismatch of Lengths Text : {len(text)}, Label : {len(label)}")
        sys.exit(1)

    data_dict = {"text" : text, "sentiment" : label}

    data_df = pd.DataFrame(data_dict)
    return data_df

In [28]:
pos_df = create_df('pos')
neg_df = create_df('neg')
pos_df.head()

INFO: Correct Length of Text and Label!
INFO: Correct Length of Text and Label!


Unnamed: 0,text,sentiment
0,assume nothing . the phrase is perhaps one of...,pos
1,plot : derek zoolander is a male model . he i...,pos
2,i actually am a fan of the original 1961 or so...,pos
3,a movie that's been as highly built up as the ...,pos
4,""" good will hunting "" is two movies in one : ...",pos


In [29]:
neg_df.head()

Unnamed: 0,text,sentiment
0,bad . bad . bad . that one word seems to pre...,neg
1,isn't it the ultimate sign of a movie's cinema...,neg
2,""" gordy "" is not a movie , it is a 90-minute-...",neg
3,disconnect the phone line . don't accept the ...,neg
4,when robert forster found himself famous again...,neg


### Concatenating the DataFrames

In [30]:
df = pd.concat([pos_df, neg_df], ignore_index=True)
df

Unnamed: 0,text,sentiment
0,assume nothing . the phrase is perhaps one of...,pos
1,plot : derek zoolander is a male model . he i...,pos
2,i actually am a fan of the original 1961 or so...,pos
3,a movie that's been as highly built up as the ...,pos
4,""" good will hunting "" is two movies in one : ...",pos
...,...,...
1995,synopsis : when a meteorite crashlands in the ...,neg
1996,it's now the anniversary of the slayings of ju...,neg
1997,coinciding with the emerging popularity of mov...,neg
1998,and now the high-flying hong kong style of fil...,neg


### Writing the DataFrame into a csv

In [34]:
df.to_csv('cornell_polarity_combined.csv')

## Exploratory Data Analysis

In [32]:
print("Number of texts in each of the Classes: ")
df['sentiment'].value_counts()

Number of texts in each of the Classes: 


pos    1000
neg    1000
Name: sentiment, dtype: int64

By looking at the value counts, we can see that the dataset is balanced and there is an equal distribution between Positive and Negative sentiments.