# Merging the Datasets to form the Training Set

## Imports

In [1]:
import pandas as pd
import numpy as np
import os

## Paths

In [2]:
# Show the kernel's current working directory; the next cell moves relative to it
# and builds the dataset paths as relative paths.
os.getcwd()

'/Users/sankeerthana/Documents/NTU/YEAR_4/CZ4045/Group_Assignment/NLP-text-classification-CNN/processing'

In [3]:
# Initialising the paths
def resolve_project_root(cwd, processing_dir):
    """Return the project root for a given working directory.

    If `cwd` is the `processing_dir` folder, its parent is returned; otherwise
    `cwd` is returned unchanged. This keeps the cell idempotent: an unconditional
    `os.chdir('..')` would climb one directory further on every re-run.
    """
    normalised = os.path.normpath(cwd)
    if os.path.basename(normalised) == processing_dir:
        return os.path.dirname(normalised)
    return cwd


PROCESSING_DIR = 'processing'
DATASETS_DIR = 'datasets'

# Work from the project root so the relative dataset paths below resolve.
PARENT_DIR = resolve_project_root(os.getcwd(), PROCESSING_DIR)
os.chdir(PARENT_DIR)

INITIAL_DIR = os.path.join(DATASETS_DIR, 'initial')

## Loading the Datasets

### Loading the Objectivity VS Subjectivity Data [Benchmark]

complete_subjectivity.csv taken from Cornell Dataset 

• 0 - Neutral/Objectivity

• 1 - Subjectivity

In [13]:
# Read the Cornell subjectivity benchmark (0 = objective/neutral, 1 = subjective) and preview it.
subj_csv_path = os.path.join(INITIAL_DIR, 'complete_subjectivity.csv')
subj = pd.read_csv(subj_csv_path)
subj.head()

Unnamed: 0,text,subjectivity
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1
3,a light-hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1


#### Observing the Distribution of Classes

In [14]:
subj['subjectivity'].value_counts()

1    5000
0    5000
Name: subjectivity, dtype: int64

### Loading the Cornell Polarity Dataset [Benchmark]

CR_balanced.csv taken from the Cornell Dataset.

• 0 - Negative

• 1 - Positive

In [35]:
# Read the Cornell polarity benchmark (0 = negative, 1 = positive) and preview it.
cornell_polarity_path = os.path.join(INITIAL_DIR, 'CR_balanced.csv')
cornell_polarity = pd.read_csv(cornell_polarity_path)
cornell_polarity.head()

Unnamed: 0,text,polarity
0,weaknesses are minor : the feel and layout of ...,0
1,many of our disney movies do n 't play on this...,0
2,player has a problem with dual-layer dvd 's su...,0
3,"i know the saying is "" you get what you pay fo...",0
4,will never purchase apex again .,0


#### Observing Distribution of Classes

In [8]:
cornell_polarity['polarity'].value_counts()

0    1366
1    1366
Name: polarity, dtype: int64

### Loading the Polarity Dataset [Crawled]

In [49]:
# Read the crawled polarity samples (columns: title, verdict) and preview them.
# NOTE(review): the preview shows mis-encoded apostrophes (e.g. "Babyâs"); the file may
# need an explicit `encoding=` or re-export — confirm against the raw CSV.
crawled_polarity_path = os.path.join(INITIAL_DIR, '100_pos_neg_from_combined.csv')
crawled_polarity = pd.read_csv(crawled_polarity_path)
crawled_polarity.head()

Unnamed: 0,title,verdict
0,Best football play of the year,2
1,"Babyâs first laugh makes the mama cry, what ...",2
2,[OC] A little kitty drawn by my local barrista...,2
3,"My wife (who in 10 years, Iâve never known t...",2
4,What a lucky stepson!,2


#### Observing Distribution of Classes

In [50]:
crawled_polarity['verdict'].value_counts()

2    100
0    100
Name: verdict, dtype: int64

## Loading the Emotion Balanced Dataset

In [6]:
# Read the balanced emotion dataset (POSITIVE / NEGATIVE labels) and preview it.
emotion_csv_path = os.path.join(INITIAL_DIR, 'emotion_balanced.csv')
emotion = pd.read_csv(emotion_csv_path)
emotion.head()

Unnamed: 0.1,Unnamed: 0,text,polarity
0,3,i am ever feeling nostalgic about the fireplac...,POSITIVE
1,6,ive been taking or milligrams or times recomme...,POSITIVE
2,8,i have been with petronas for years i feel tha...,POSITIVE
3,9,i feel romantic too,POSITIVE
4,11,i do feel that running is a divine experience ...,POSITIVE


In [7]:
# Drop the leftover index column written into the CSV. Reassigning (instead of
# inplace=True) together with errors='ignore' lets this cell be re-run without a
# KeyError once the column is already gone.
emotion = emotion.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
emotion.head()

Unnamed: 0,text,polarity
0,i am ever feeling nostalgic about the fireplac...,POSITIVE
1,ive been taking or milligrams or times recomme...,POSITIVE
2,i have been with petronas for years i feel tha...,POSITIVE
3,i feel romantic too,POSITIVE
4,i do feel that running is a divine experience ...,POSITIVE


In [10]:
emotion['polarity'].value_counts()

POSITIVE    7000
NEGATIVE    7000
Name: polarity, dtype: int64

## Observations 

Excluding the emotion dataset, the benchmark and crawled sets give the following class distribution:

1. Neutral = 5000
2. Negative = 1366 + 100 = 1466
3. Positive = 1366 + 100 = 1466

As observed, there is a clear class imbalance towards the Neutral class. This could bias the model towards predicting every uncertain sample as neutral. To avoid that, we keep the Neutral class at roughly the same size as the Positive and Negative classes, but slightly larger. The revised numbers are:

1. Neutral = 1500 --> will be taking only 1500 from the cornell subjectivity set
2. Negative = 1366 + 100 = 1466
3. Positive = 1366 + 100 = 1466

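Total = 1500 + 1466 + 1466 = **4432**

**Note:** the set actually saved at the end of this notebook (`final_training_set_emotion.csv`) is built from the emotion dataset plus all 5000 Cornell neutral rows (7000 pos / 7000 neg / 5000 neut = 19000 rows), not from the 4432-row plan above.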


## Preparing the Required Data

### Changing the labels accordingly to avoid confusion 

#### Cornell Polarity

In [38]:
# Cornell Polarity: swap the numeric codes for readable labels (0 -> 'neg', 1 -> 'pos').
polarity_labels = {0: 'neg', 1: 'pos'}
cornell_polarity['polarity'] = cornell_polarity['polarity'].replace(polarity_labels)


In [39]:
cornell_polarity.head()

Unnamed: 0,text,polarity
0,weaknesses are minor : the feel and layout of ...,neg
1,many of our disney movies do n 't play on this...,neg
2,player has a problem with dual-layer dvd 's su...,neg
3,"i know the saying is "" you get what you pay fo...",neg
4,will never purchase apex again .,neg


In [41]:
cornell_polarity['polarity'].value_counts()

neg    1366
pos    1366
Name: polarity, dtype: int64

In [56]:
# Standardise the label column name to 'sentiment' ('text' already has its final name).
cornell_polarity.rename({'polarity': 'sentiment'}, axis='columns', inplace=True)
cornell_polarity.head()

Unnamed: 0,text,sentiment
0,weaknesses are minor : the feel and layout of ...,neg
1,many of our disney movies do n 't play on this...,neg
2,player has a problem with dual-layer dvd 's su...,neg
3,"i know the saying is "" you get what you pay fo...",neg
4,will never purchase apex again .,neg


#### Complete Subjectivity

In [15]:
# Swap the numeric codes for readable labels (0 -> 'neut', 1 -> 'subj').
subjectivity_labels = {0: 'neut', 1: 'subj'}
subj['subjectivity'] = subj['subjectivity'].replace(subjectivity_labels)

In [16]:
subj['subjectivity'].value_counts()

subj    5000
neut    5000
Name: subjectivity, dtype: int64

In [17]:
subj.head()

Unnamed: 0,text,subjectivity
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj


In [18]:
# Standardise the label column name to 'sentiment' ('text' already has its final name).
subj.rename({'subjectivity': 'sentiment'}, axis='columns', inplace=True)
subj.head()

Unnamed: 0,text,sentiment
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj


#### Extra Crawled Data 

100_pos_neg_from_combined.csv

In [61]:
# Swap the crawled verdict codes for readable labels (2 -> 'pos', 0 -> 'neg'), then check the counts.
verdict_labels = {2: 'pos', 0: 'neg'}
crawled_polarity['verdict'] = crawled_polarity['verdict'].replace(verdict_labels)
crawled_polarity['verdict'].value_counts()

pos    100
neg    100
Name: verdict, dtype: int64

In [63]:
# Align the crawled set's column names with the other datasets: title -> text, verdict -> sentiment.
crawled_column_names = {'title': 'text', 'verdict': 'sentiment'}
crawled_polarity.rename(crawled_column_names, axis='columns', inplace=True)
crawled_polarity.head()

Unnamed: 0,text,sentiment
0,Best football play of the year,pos
1,"Babyâs first laugh makes the mama cry, what ...",pos
2,[OC] A little kitty drawn by my local barrista...,pos
3,"My wife (who in 10 years, Iâve never known t...",pos
4,What a lucky stepson!,pos


In [25]:
# Emotion dataset: shorten the labels (POSITIVE -> 'pos', NEGATIVE -> 'neg') and
# rename the label column to 'sentiment' to match the other datasets.
emotion_labels = {'POSITIVE': 'pos', 'NEGATIVE': 'neg'}
emotion['polarity'] = emotion['polarity'].replace(emotion_labels)
emotion.rename({'polarity': 'sentiment'}, axis='columns', inplace=True)
emotion.head()

Unnamed: 0,text,sentiment
0,i am ever feeling nostalgic about the fireplac...,pos
1,ive been taking or milligrams or times recomme...,pos
2,i have been with petronas for years i feel tha...,pos
3,i feel romantic too,pos
4,i do feel that running is a divine experience ...,pos


### Extracting out only 1500 Neutral - Randomly

In [26]:
# Keep only the neutral ('neut') rows of the Cornell subjectivity dataset.
is_neutral = subj['sentiment'].eq('neut')
cornell_neutral = subj[is_neutral]
cornell_neutral['sentiment'].value_counts()

neut    5000
Name: sentiment, dtype: int64

In [27]:
# Randomly sample the neutral rows. A fixed seed makes the sample (and the CSV written
# from it) reproducible across kernel restarts; without it every run picks different rows.
NEUTRAL_SAMPLE_SIZE = 1500
RANDOM_SEED = 42
cornell_1500_neutral = cornell_neutral.sample(n=NEUTRAL_SAMPLE_SIZE, random_state=RANDOM_SEED)
cornell_1500_neutral['sentiment'].value_counts()

neut    1500
Name: sentiment, dtype: int64

In [28]:
# Persist the sampled neutral rows (without the index) to the current working directory.
cornell_1500_neutral.to_csv(path_or_buf='cornell_1500_neutral.csv', index=False)

### Merging the Crawled and Benchmark Polarity Sets

In [69]:
cornell_polarity.head()

Unnamed: 0,text,sentiment
0,weaknesses are minor : the feel and layout of ...,neg
1,many of our disney movies do n 't play on this...,neg
2,player has a problem with dual-layer dvd 's su...,neg
3,"i know the saying is "" you get what you pay fo...",neg
4,will never purchase apex again .,neg


In [70]:
crawled_polarity.head()

Unnamed: 0,text,sentiment
0,Best football play of the year,pos
1,"Babyâs first laugh makes the mama cry, what ...",pos
2,[OC] A little kitty drawn by my local barrista...,pos
3,"My wife (who in 10 years, Iâve never known t...",pos
4,What a lucky stepson!,pos


In [72]:
# Concatenate the benchmark and crawled polarity sets.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise both frames keep
# their own 0-based index and the combined frame has duplicate index labels.
pos_neg_set = pd.concat([cornell_polarity, crawled_polarity], ignore_index=True)
pos_neg_set['sentiment'].value_counts()

neg    1466
pos    1466
Name: sentiment, dtype: int64

### Merging Neutral to Pos-Neg Set - FINAL SET

In [29]:
# Combine the benchmark + crawled polarity rows with the 1500 sampled neutral rows.
# NOTE(review): `pos_neg_set` is only created by the concat cell in the previous section;
# the NameError recorded below means that cell had not been run in the kernel session
# that produced this output. `training_set` is also reassigned by the following cell
# before anything is saved — confirm whether this cell is still needed.
training_set = pd.concat([pos_neg_set,cornell_1500_neutral])
training_set['sentiment'].value_counts()

NameError: name 'pos_neg_set' is not defined

In [36]:
# Build the final training set from the emotion dataset plus ALL Cornell neutral rows.
# NOTE(review): this uses `cornell_neutral` (5000 rows), not the 1500-row sample —
# presumably to stay proportionate to the 7000/7000 emotion classes; confirm.
# ignore_index=True avoids duplicate index labels carried over from the two source frames.
training_set = pd.concat([emotion, cornell_neutral], ignore_index=True)
training_set['sentiment'].value_counts()

pos     7000
neg     7000
neut    5000
Name: sentiment, dtype: int64

In [37]:
training_set.shape

(19000, 2)

## Renaming the Values

In [38]:
# Encode the string labels as integers: neg -> 0, neut -> 1, pos -> 2.
label_to_id = {'neg': 0, 'neut': 1, 'pos': 2}
training_set['sentiment'] = training_set['sentiment'].replace(label_to_id)
training_set['sentiment'].value_counts()

2    7000
0    7000
1    5000
Name: sentiment, dtype: int64

In [39]:
# Write the final training set (without the index) to the current working directory.
training_set.to_csv(path_or_buf='final_training_set_emotion.csv', index=False)